Fujitsu Sparc64 acceleration and general fixes for non-x86 builds
author Erik Lindahl <erik@kth.se>
Sun, 6 Jan 2013 13:50:23 +0000 (14:50 +0100)
committer Gerrit Code Review <gerrit@gerrit.gromacs.org>
Tue, 21 May 2013 12:51:50 +0000 (14:51 +0200)
Fixes the build configuration so that it no longer assumes x86 and
avoids warnings, in particular when a non-x86 acceleration is used.
The cpu detection code has been extended to parse /proc/cpuinfo on
Linux in cases where the x86 CPUID instruction (or the inline
assembly to execute it) is not available.
Finally, there are new group kernels accelerated for use on the
K computer, which uses the Sparc64 HPC-ACE instruction set. These
kernels are roughly 35% faster than the compiled C version, which
means Gromacs-4.6 is now ~70% faster on K than Gromacs-4.5.

Change-Id: I92559f0ac6159b504f100447a41a03e4b33fec19

120 files changed:
CMakeLists.txt
cmake/Toolchain-Fujitsu-Sparc64-mpi.cmake [new file with mode: 0644]
cmake/Toolchain-Fujitsu-Sparc64.cmake [new file with mode: 0644]
include/gmx_cpuid.h
src/config.h.cmakein
src/gmxlib/gmx_cpuid.c
src/gmxlib/nonbonded/CMakeLists.txt
src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/kernelutil_sparc64_hpc_ace_double.h [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/make_nb_kernel_sparc64_hpc_ace_double.py [new file with mode: 0755]
src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCSTab_VdwCSTab_GeomP1P1_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCSTab_VdwCSTab_GeomW3P1_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCSTab_VdwLJ_GeomP1P1_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCSTab_VdwLJ_GeomW3P1_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCSTab_VdwLJ_GeomW4W4_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCSTab_VdwNone_GeomP1P1_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCSTab_VdwNone_GeomW3P1_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCSTab_VdwNone_GeomW3W3_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCSTab_VdwNone_GeomW4P1_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCSTab_VdwNone_GeomW4W4_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCoul_VdwCSTab_GeomP1P1_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCoul_VdwCSTab_GeomW3P1_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCoul_VdwCSTab_GeomW3W3_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCoul_VdwCSTab_GeomW4W4_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCoul_VdwLJ_GeomP1P1_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCoul_VdwLJ_GeomW3P1_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCoul_VdwLJ_GeomW3W3_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCoul_VdwLJ_GeomW4P1_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCoul_VdwLJ_GeomW4W4_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCoul_VdwNone_GeomP1P1_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCoul_VdwNone_GeomW3P1_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCoul_VdwNone_GeomW3W3_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCoul_VdwNone_GeomW4P1_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCoul_VdwNone_GeomW4W4_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEwSh_VdwLJSh_GeomP1P1_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEwSh_VdwLJSh_GeomW4W4_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEwSh_VdwNone_GeomP1P1_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEwSh_VdwNone_GeomW3P1_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEwSh_VdwNone_GeomW3W3_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEwSh_VdwNone_GeomW4P1_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEwSh_VdwNone_GeomW4W4_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEwSw_VdwLJSw_GeomP1P1_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEwSw_VdwLJSw_GeomW3P1_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEwSw_VdwNone_GeomP1P1_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEwSw_VdwNone_GeomW3P1_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEwSw_VdwNone_GeomW3W3_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEwSw_VdwNone_GeomW4P1_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEwSw_VdwNone_GeomW4W4_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEw_VdwCSTab_GeomP1P1_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEw_VdwCSTab_GeomW3P1_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEw_VdwCSTab_GeomW3W3_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEw_VdwCSTab_GeomW4P1_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEw_VdwCSTab_GeomW4W4_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEw_VdwLJ_GeomP1P1_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEw_VdwLJ_GeomW3P1_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEw_VdwLJ_GeomW3W3_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEw_VdwLJ_GeomW4P1_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEw_VdwLJ_GeomW4W4_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEw_VdwNone_GeomP1P1_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEw_VdwNone_GeomW3P1_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEw_VdwNone_GeomW3W3_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEw_VdwNone_GeomW4P1_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEw_VdwNone_GeomW4W4_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecGB_VdwCSTab_GeomP1P1_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecGB_VdwLJ_GeomP1P1_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecGB_VdwNone_GeomP1P1_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecNone_VdwCSTab_GeomP1P1_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecNone_VdwLJSh_GeomP1P1_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecNone_VdwLJSw_GeomP1P1_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecNone_VdwLJ_GeomP1P1_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRFCut_VdwCSTab_GeomP1P1_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRFCut_VdwCSTab_GeomW3P1_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRFCut_VdwCSTab_GeomW3W3_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRFCut_VdwCSTab_GeomW4P1_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRFCut_VdwLJSh_GeomP1P1_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRFCut_VdwLJSh_GeomW3P1_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRFCut_VdwLJSh_GeomW3W3_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRFCut_VdwLJSh_GeomW4W4_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRFCut_VdwLJSw_GeomP1P1_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRFCut_VdwLJSw_GeomW3P1_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRFCut_VdwLJSw_GeomW3W3_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRFCut_VdwLJSw_GeomW4W4_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRFCut_VdwNone_GeomP1P1_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRFCut_VdwNone_GeomW3P1_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRFCut_VdwNone_GeomW3W3_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRFCut_VdwNone_GeomW4P1_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRFCut_VdwNone_GeomW4W4_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRF_VdwCSTab_GeomP1P1_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRF_VdwCSTab_GeomW3P1_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRF_VdwCSTab_GeomW3W3_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRF_VdwCSTab_GeomW4P1_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRF_VdwCSTab_GeomW4W4_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRF_VdwLJ_GeomP1P1_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRF_VdwLJ_GeomW3P1_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRF_VdwLJ_GeomW3W3_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRF_VdwLJ_GeomW4P1_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRF_VdwLJ_GeomW4W4_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRF_VdwNone_GeomP1P1_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRF_VdwNone_GeomW3P1_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRF_VdwNone_GeomW3W3_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRF_VdwNone_GeomW4P1_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRF_VdwNone_GeomW4W4_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_sparc64_hpc_ace_double.c [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_sparc64_hpc_ace_double.h [new file with mode: 0644]
src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_template_sparc64_hpc_ace_double.pre [new file with mode: 0644]
src/gmxlib/nonbonded/nonbonded.c

index 66da0e24535c182a6215f979008f13f35deac099..a62a3a4efa9d05f439683306429bb65ee91ddb29 100644 (file)
@@ -183,7 +183,7 @@ if(NOT DEFINED GMX_CPU_ACCELERATION)
 endif(NOT DEFINED GMX_CPU_ACCELERATION)
 
 set(GMX_CPU_ACCELERATION "@GMX_SUGGESTED_CPU_ACCELERATION@"
-    CACHE STRING "Accelerated CPU kernels. Pick one of: None, SSE2, SSE4.1, AVX_128_FMA, AVX_256, IBM_QPX")
+    CACHE STRING "Accelerated CPU kernels. Pick one of: None, SSE2, SSE4.1, AVX_128_FMA, AVX_256, IBM_QPX, Sparc64_HPC_ACE")
 
 set(GMX_FFT_LIBRARY "fftw3" 
     CACHE STRING "FFT library choices: fftw3,mkl,fftpack[built-in]")
@@ -874,9 +874,10 @@ elseif(${GMX_CPU_ACCELERATION} STREQUAL "IBM_QPX")
     else()
         message(FATAL_ERROR "Cannot compile IBM QPX intrinsics without the XL compiler. If you are compiling for BlueGene/Q, use 'cmake .. -DCMAKE_TOOLCHAIN_FILE=BlueGeneQ-static-XL-C' to set up the tool chain.")
     endif()
-
+elseif(${GMX_CPU_ACCELERATION} STREQUAL "SPARC64_HPC_ACE")
+    set(GMX_CPU_ACCELERATION_SPARC64_HPC_ACE 1)
 else(${GMX_CPU_ACCELERATION} STREQUAL "NONE")
-    MESSAGE(FATAL_ERROR "Unrecognized option for accelerated kernels: ${GMX_CPU_ACCELERATION}. Pick one of None, SSE2, SSE4.1, AVX_128_FMA, AVX_256, IBM_QPX")
+    MESSAGE(FATAL_ERROR "Unrecognized option for accelerated kernels: ${GMX_CPU_ACCELERATION}. Pick one of None, SSE2, SSE4.1, AVX_128_FMA, AVX_256, IBM_QPX, Sparc64_HPC_ACE")
 endif(${GMX_CPU_ACCELERATION} STREQUAL "NONE")
 set(ACCELERATION_QUIETLY TRUE CACHE INTERNAL "")
 
@@ -928,11 +929,11 @@ if(${GMX_FFT_LIBRARY} STREQUAL "FFTW3")
 
     set(GMX_FFT_FFTW3 1)
 
-    if (NOT ${GMX_CPU_ACCELERATION} STREQUAL "NONE" AND NOT ${FFTW}_HAVE_SIMD) 
+    if ((${GMX_CPU_ACCELERATION} MATCHES "SSE" OR ${GMX_CPU_ACCELERATION} MATCHES "AVX") AND NOT ${FFTW}_HAVE_SIMD)
       message(WARNING "The fftw library found is compiled without SIMD support, which makes it slow. Consider recompiling it or contact your admin")
     endif()
 
-    if(NOT ${GMX_CPU_ACCELERATION} STREQUAL "NONE" AND ${FFTW}_HAVE_AVX)
+    if((${GMX_CPU_ACCELERATION} MATCHES "SSE" OR ${GMX_CPU_ACCELERATION} MATCHES "AVX") AND ${FFTW}_HAVE_AVX)
         # If we're not doing CPU acceleration, we don't care about FFTW performance on x86 either
         message(WARNING "The FFTW library was compiled with --enable-avx to enable AVX SIMD instructions. That might sound like a good idea for your processor, but for FFTW versions up to 3.3.3, these are slower than the SSE/SSE2 SIMD instructions for the way GROMACS uses FFTs. Limitations in the way FFTW allows GROMACS to measure performance make it awkward for either GROMACS or FFTW to make the decision for you based on runtime performance. You should compile a different FFTW library with --enable-sse or --enable-sse2. If you have a more recent FFTW, you may like to compare the performance of GROMACS with FFTW libraries compiled with and without --enable-avx. However, the GROMACS developers do not really expect the FFTW AVX optimization to help, because the performance is limited by memory access, not computation.")
     endif()
diff --git a/cmake/Toolchain-Fujitsu-Sparc64-mpi.cmake b/cmake/Toolchain-Fujitsu-Sparc64-mpi.cmake
new file mode 100644 (file)
index 0000000..14c58b9
--- /dev/null
@@ -0,0 +1,55 @@
+#
+# This file is part of the GROMACS molecular simulation package.
+#
+# Copyright (c) 2012, by the GROMACS development team, led by
+# David van der Spoel, Berk Hess, Erik Lindahl, and including many
+# others, as listed in the AUTHORS file in the top-level source
+# directory and at http://www.gromacs.org.
+#
+# GROMACS is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public License
+# as published by the Free Software Foundation; either version 2.1
+# of the License, or (at your option) any later version.
+#
+# GROMACS is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with GROMACS; if not, see
+# http://www.gnu.org/licenses, or write to the Free Software Foundation,
+# Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+#
+# If you want to redistribute modifications to GROMACS, please
+# consider that scientific software is very special. Version
+# control is crucial - bugs must be traceable. We will be happy to
+# consider code for inclusion in the official distribution, but
+# derived work must not be called official GROMACS. Details are found
+# in the README & COPYING files - if they are missing, get the
+# official version at http://www.gromacs.org.
+#
+# To help us fund GROMACS development, we humbly ask that you cite
+# the research papers on the package. Check out http://www.gromacs.org.
+#
+# the name of the target operating system
+set(CMAKE_SYSTEM_NAME Linux CACHE STRING "Cross-compiling for Fujitsu Sparc64")
+
+set_property(GLOBAL PROPERTY TARGET_SUPPORTS_SHARED_LIBS FALSE)
+
+# set the compiler
+set(CMAKE_C_COMPILER mpifccpx)
+set(CMAKE_CXX_COMPILER mpiFCCpx)
+set(CMAKE_C_COMPILER_ID "Fujitsu" CACHE STRING "Prevent CMake from adding GNU-specific linker flags (-rdynamic)" FORCE)
+
+set(CMAKE_C_FLAGS "-Kopenmp -Kfast,reduction,swp,simd=2,uxsimd -x500 -Xg -DGMX_RELAXED_DOUBLE_PRECISION -w" CACHE STRING "Fujitsu Sparc64 C Flags" FORCE)
+set(CMAKE_CXX_FLAGS "${CMAKE_C_FLAGS}" CACHE STRING "Fujitsu Sparc64 C++ Flags" FORCE)
+set(GMX_SOFTWARE_INVSQRT OFF CACHE BOOL "Use native 1.0/sqrt(x) on Fujitsu Sparc64" FORCE)
+
+set(GMX_THREAD_MPI OFF CACHE BOOL "Use real MPI instead" FORCE)
+set(GMX_MPI ON CACHE BOOL "Use MPI library" FORCE)
+set(GMX_DOUBLE ON CACHE BOOL "Use double by default on Fujitsu Sparc64 (due to HPC-ACE)" FORCE)
+set(GMX_GPU OFF CACHE BOOL "Cannot do GPU acceleration on Fujitsu Sparc64" FORCE)
+set(BUILD_SHARED_LIBS OFF CACHE BOOL "Use static linking by default on Fujitsu Sparc64" FORCE)
+
+set(GMX_CPU_ACCELERATION "Sparc64_HPC_ACE" CACHE STRING "Enabling Sparc64 HPC-ACE acceleration when using Fujitsu Sparc64 toolchain")
diff --git a/cmake/Toolchain-Fujitsu-Sparc64.cmake b/cmake/Toolchain-Fujitsu-Sparc64.cmake
new file mode 100644 (file)
index 0000000..c76c4d9
--- /dev/null
@@ -0,0 +1,54 @@
+#
+# This file is part of the GROMACS molecular simulation package.
+#
+# Copyright (c) 2012, by the GROMACS development team, led by
+# David van der Spoel, Berk Hess, Erik Lindahl, and including many
+# others, as listed in the AUTHORS file in the top-level source
+# directory and at http://www.gromacs.org.
+#
+# GROMACS is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public License
+# as published by the Free Software Foundation; either version 2.1
+# of the License, or (at your option) any later version.
+#
+# GROMACS is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with GROMACS; if not, see
+# http://www.gnu.org/licenses, or write to the Free Software Foundation,
+# Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+#
+# If you want to redistribute modifications to GROMACS, please
+# consider that scientific software is very special. Version
+# control is crucial - bugs must be traceable. We will be happy to
+# consider code for inclusion in the official distribution, but
+# derived work must not be called official GROMACS. Details are found
+# in the README & COPYING files - if they are missing, get the
+# official version at http://www.gromacs.org.
+#
+# To help us fund GROMACS development, we humbly ask that you cite
+# the research papers on the package. Check out http://www.gromacs.org.
+#
+# the name of the target operating system
+set(CMAKE_SYSTEM_NAME Linux CACHE STRING "Cross-compiling for Fujitsu Sparc64")
+
+set_property(GLOBAL PROPERTY TARGET_SUPPORTS_SHARED_LIBS FALSE)
+
+# set the compiler
+set(CMAKE_C_COMPILER fccpx)
+set(CMAKE_CXX_COMPILER FCCpx)
+set(CMAKE_C_COMPILER_ID "Fujitsu" CACHE STRING "Prevent CMake from adding GNU-specific linker flags (-rdynamic)" FORCE)
+
+set(CMAKE_C_FLAGS "-Kopenmp -Kfast,reduction,swp,simd=2,uxsimd -x500 -Xg -DGMX_RELAXED_DOUBLE_PRECISION -w" CACHE STRING "Fujitsu Sparc64 C Flags" FORCE)
+set(CMAKE_CXX_FLAGS "${CMAKE_C_FLAGS}" CACHE STRING "Fujitsu Sparc64 C++ Flags" FORCE)
+set(GMX_SOFTWARE_INVSQRT OFF CACHE BOOL "Use native 1.0/sqrt(x) on Fujitsu Sparc64" FORCE)
+
+# By default CMake will use thread-mpi
+set(GMX_DOUBLE ON CACHE BOOL "Use double by default on Fujitsu Sparc64 (due to HPC-ACE)" FORCE)
+set(GMX_GPU OFF CACHE BOOL "Cannot do GPU acceleration on Fujitsu Sparc64" FORCE)
+set(BUILD_SHARED_LIBS OFF CACHE BOOL "Use static linking by default on Fujitsu Sparc64" FORCE)
+
+set(GMX_CPU_ACCELERATION "Sparc64_HPC_ACE" CACHE STRING "Enabling Sparc64 HPC-ACE acceleration when using Fujitsu Sparc64 toolchain")
index 3b5beb65ed95be5857b6791d22798f49a3db232d..8d3a968c46367e4ec125cca88f1e25b1504ecc13 100644 (file)
@@ -54,6 +54,8 @@ enum gmx_cpuid_vendor
     GMX_CPUID_VENDOR_UNKNOWN,
     GMX_CPUID_VENDOR_INTEL,
     GMX_CPUID_VENDOR_AMD,
+    GMX_CPUID_VENDOR_FUJITSU,
+    GMX_CPUID_VENDOR_IBM,
     GMX_CPUID_NVENDORS
 };
 
@@ -127,6 +129,7 @@ enum gmx_cpuid_acceleration
     GMX_CPUID_ACCELERATION_X86_SSE4_1,
     GMX_CPUID_ACCELERATION_X86_AVX_128_FMA,
     GMX_CPUID_ACCELERATION_X86_AVX_256,
+    GMX_CPUID_ACCELERATION_SPARC64_HPC_ACE,
     GMX_CPUID_NACCELERATIONS
 };
 
index 2a632cbd0db64ae65bac172165878a8d83793004..ffa22a1fab7bae5ea8f31b33d3ce167d8c5ad964 100644 (file)
 /* IBM QPX was selected as CPU acceleration type (e.g. BlueGene/Q) */
 #cmakedefine GMX_CPU_ACCELERATION_IBM_QPX
 
+/* Fujitsu Sparc64 HPC-ACE SIMD acceleration */
+#cmakedefine GMX_CPU_ACCELERATION_SPARC64_HPC_ACE
+
 /* String for CPU acceleration choice (for writing to log files and stdout) */
 #define GMX_CPU_ACCELERATION_STRING "@GMX_CPU_ACCELERATION@"
 
index 402af15f41d64d1070f057d9d2b8fa36919976ef..c17e8097185289fa00c0ab9ccd96aa6f85ebf725 100644 (file)
  * in a single file, but to avoid repeated ifdefs we set the overall architecture here.
  */
 #if defined (__i386__) || defined (__x86_64__) || defined (_M_IX86) || defined (_M_X64)
+/* OK, it is x86, but can we execute cpuid? */
+#if defined(GMX_X86_GCC_INLINE_ASM) || ( defined(_MSC_VER) && ( (_MSC_VER > 1500) || (_MSC_VER==1500 & _MSC_FULL_VER >= 150030729)))
 #    define GMX_CPUID_X86
 #endif
+#endif
 
 /* Global constant character strings corresponding to our enumerated types */
 const char *
@@ -74,7 +77,9 @@ gmx_cpuid_vendor_string[GMX_CPUID_NVENDORS] =
     "CannotDetect",
     "Unknown",
     "GenuineIntel",
-    "AuthenticAMD"
+    "AuthenticAMD",
+    "Fujitsu",
+    "IBM"
 };
 
 const char *
@@ -125,7 +130,8 @@ gmx_cpuid_acceleration_string[GMX_CPUID_NACCELERATIONS] =
     "SSE2",
     "SSE4.1",
     "AVX_128_FMA",
-    "AVX_256"
+    "AVX_256",
+    "Sparc64 HPC-ACE"
 };
 
 /* Max length of brand string */
@@ -223,6 +229,10 @@ enum gmx_cpuid_acceleration
 static const
 enum gmx_cpuid_acceleration
     compiled_acc = GMX_CPUID_ACCELERATION_X86_SSE2;
+#elif defined GMX_CPU_ACCELERATION_SPARC64_HPC_ACE
+static const
+enum gmx_cpuid_acceleration
+    compiled_acc = GMX_CPUID_ACCELERATION_SPARC64_HPC_ACE;
 #else
 static const
 enum gmx_cpuid_acceleration
@@ -696,6 +706,48 @@ cpuid_check_intel_x86(gmx_cpuid_t                cpuid)
 
 
 
+
+static void
+chomp_substring_before_colon(const char *in, char *s, int maxlength)
+{
+    char *p;
+    strncpy(s,in,maxlength);
+    p = strchr(s,':');
+    if(p!=NULL)
+    {
+        *p='\0';
+        while(isspace(*(--p)) && (p>=s))
+        {
+            *p='\0';
+        }
+    }
+    else
+    {
+        *s='\0';
+    }
+}
+
+static void
+chomp_substring_after_colon(const char *in, char *s, int maxlength)
+{
+    char *p;
+    if( (p = strchr(in,':'))!=NULL)
+    {
+        p++;
+        while(isspace(*p)) p++;
+        strncpy(s,p,maxlength);
+        p = s+strlen(s);
+        while(isspace(*(--p)) && (p>=s))
+        {
+            *p='\0';
+        }
+    }
+    else
+    {
+        *s='\0';
+    }
+}
+
 /* Try to find the vendor of the current CPU, so we know what specific
  * detection routine to call.
  */
@@ -706,6 +758,8 @@ cpuid_check_vendor(void)
     /* Register data used on x86 */
     unsigned int               eax, ebx, ecx, edx;
     char                       vendorstring[13];
+    FILE *                     fp;
+    char                       buffer[255],buffer2[255];
 
     /* Set default first */
     vendor = GMX_CPUID_VENDOR_UNKNOWN;
@@ -726,6 +780,29 @@ cpuid_check_vendor(void)
             vendor = i;
         }
     }
+#elif defined(__linux__) || defined(__linux)
+    /* General Linux. Try to get CPU vendor from /proc/cpuinfo */
+    if( (fp = fopen("/proc/cpuinfo","r")) != NULL)
+    {
+        while( (vendor == GMX_CPUID_VENDOR_UNKNOWN) && (fgets(buffer,sizeof(buffer),fp) != NULL))
+        {
+            chomp_substring_before_colon(buffer,buffer2,sizeof(buffer2));
+            /* Intel/AMD use "vendor_id", IBM "vendor". Fujitsu "manufacture". Add others if you have them! */
+            if( !strcmp(buffer2,"vendor_id") || !strcmp(buffer2,"vendor") || !strcmp(buffer2,"manufacture") )
+            {
+                chomp_substring_after_colon(buffer,buffer2,sizeof(buffer2));
+                for(i=GMX_CPUID_VENDOR_UNKNOWN; i<GMX_CPUID_NVENDORS; i++)
+                {
+                    /* Be liberal and accept if we find the vendor anywhere in string */
+                    if(strstr(buffer2,gmx_cpuid_vendor_string[i]))
+                    {
+                        vendor = i;
+                    }
+                }
+            }
+        }
+    }
+    fclose(fp);
 #else
     vendor = GMX_CPUID_VENDOR_UNKNOWN;
 #endif
@@ -794,6 +871,9 @@ gmx_cpuid_init               (gmx_cpuid_t *              pcpuid)
 {
     gmx_cpuid_t cpuid;
     int         i;
+    FILE *      fp;
+    char        buffer[255],buffer2[255];
+    int         found_brand;
 
     cpuid = malloc(sizeof(*cpuid));
 
@@ -803,6 +883,7 @@ gmx_cpuid_init               (gmx_cpuid_t *              pcpuid)
     {
         cpuid->feature[i] = 0;
     }
+
     cpuid->have_cpu_topology   = 0;
     cpuid->nproc               = 0;
     cpuid->npackages           = 0;
@@ -826,20 +907,37 @@ gmx_cpuid_init               (gmx_cpuid_t *              pcpuid)
             break;
 #endif
         default:
-            /* Could not find vendor */
-            strncpy(cpuid->brand, "Unknown CPU brand", GMX_CPUID_BRAND_MAXLEN);
+            /* Default value */
+            strncpy(cpuid->brand,"Unknown CPU brand",GMX_CPUID_BRAND_MAXLEN);
+#if defined(__linux__) || defined(__linux)
+            /* General Linux. Try to get CPU type from /proc/cpuinfo. fp is only closed when fopen() succeeded. */
+            if( (fp = fopen("/proc/cpuinfo","r")) != NULL)
+            {
+                found_brand = 0;
+                while( (found_brand==0) && (fgets(buffer,sizeof(buffer),fp) !=NULL))
+                {
+                    chomp_substring_before_colon(buffer,buffer2,sizeof(buffer2));
+                    /* Intel uses "model name", Fujitsu and IBM "cpu". */
+                    if( !strcmp(buffer2,"model name") || !strcmp(buffer2,"cpu"))
+                    {
+                        chomp_substring_after_colon(buffer,cpuid->brand,GMX_CPUID_BRAND_MAXLEN);
+                        found_brand = 1;
+                    }
+                }
+                fclose(fp);
+            }
+#endif
             cpuid->family         = 0;
             cpuid->model          = 0;
             cpuid->stepping       = 0;
-
-            for (i = 0; i < GMX_CPUID_NFEATURES; i++)
+            
+            for(i=0; i<GMX_CPUID_NFEATURES; i++)
             {
-                cpuid->feature[i] = 0;
+                cpuid->feature[i]=0;
             }
             cpuid->feature[GMX_CPUID_FEATURE_CANNOTDETECT] = 1;
             break;
     }
-
     return 0;
 }
 
@@ -950,7 +1048,13 @@ gmx_cpuid_acceleration_suggest  (gmx_cpuid_t                 cpuid)
             tmpacc = GMX_CPUID_ACCELERATION_X86_SSE2;
         }
     }
-
+    else if(gmx_cpuid_vendor(cpuid)==GMX_CPUID_VENDOR_FUJITSU)
+    {
+        if(strstr(gmx_cpuid_brand(cpuid),"SPARC64"))
+        {
+            tmpacc = GMX_CPUID_ACCELERATION_SPARC64_HPC_ACE;
+        }
+    }
     return tmpacc;
 }
 
@@ -1001,7 +1105,6 @@ gmx_cpuid_acceleration_check(gmx_cpuid_t   cpuid,
 }
 
 
-
 #ifdef GMX_CPUID_STANDALONE
 /* Stand-alone program to enable queries of CPU features from Cmake.
  * Note that you need to check inline ASM capabilities before compiling and set
index 02dc83e4015fa702ca56415e4f4c8d23b9187a8d..e14de7066619615e3942d2d759897d0f85d8a136 100644 (file)
 # Sources that should always be built
 file(GLOB NONBONDED_SOURCES *.c nb_kernel_c/*.c)
 
-if(GMX_CPU_ACCELERATION STREQUAL "SSE2" AND NOT GMX_DOUBLE)
+if("${GMX_CPU_ACCELERATION}" STREQUAL "SSE2" AND NOT GMX_DOUBLE)
     file(GLOB NONBONDED_SSE2_SINGLE_SOURCES nb_kernel_sse2_single/*.c)
 endif()
 
-if(GMX_CPU_ACCELERATION STREQUAL "SSE4.1" AND NOT GMX_DOUBLE)
+if("${GMX_CPU_ACCELERATION}" STREQUAL "SSE4.1" AND NOT GMX_DOUBLE)
     file(GLOB NONBONDED_SSE4_1_SINGLE_SOURCES nb_kernel_sse4_1_single/*.c)
 endif()
 
-if(GMX_CPU_ACCELERATION STREQUAL "AVX_128_FMA" AND NOT GMX_DOUBLE)
+if("${GMX_CPU_ACCELERATION}" STREQUAL "AVX_128_FMA" AND NOT GMX_DOUBLE)
     file(GLOB NONBONDED_AVX_128_FMA_SINGLE_SOURCES nb_kernel_avx_128_fma_single/*.c)
 endif()
 
-if(GMX_CPU_ACCELERATION STREQUAL "AVX_256" AND NOT GMX_DOUBLE)
+if("${GMX_CPU_ACCELERATION}" STREQUAL "AVX_256" AND NOT GMX_DOUBLE)
     file(GLOB NONBONDED_AVX_256_SINGLE_SOURCES nb_kernel_avx_256_single/*.c)
 endif()
 
-if(GMX_CPU_ACCELERATION STREQUAL "SSE2" AND GMX_DOUBLE)
+if("${GMX_CPU_ACCELERATION}" STREQUAL "SSE2" AND GMX_DOUBLE)
     file(GLOB NONBONDED_SSE2_DOUBLE_SOURCES nb_kernel_sse2_double/*.c)
 endif()
 
-if(GMX_CPU_ACCELERATION STREQUAL "SSE4.1" AND GMX_DOUBLE)
+if("${GMX_CPU_ACCELERATION}" STREQUAL "SSE4.1" AND GMX_DOUBLE)
     file(GLOB NONBONDED_SSE4_1_DOUBLE_SOURCES nb_kernel_sse4_1_double/*.c)
 endif()
 
-if(GMX_CPU_ACCELERATION STREQUAL "AVX_128_FMA" AND GMX_DOUBLE)
+if("${GMX_CPU_ACCELERATION}" STREQUAL "AVX_128_FMA" AND GMX_DOUBLE)
     file(GLOB NONBONDED_AVX_128_FMA_DOUBLE_SOURCES nb_kernel_avx_128_fma_double/*.c)
 endif()
 
-if(GMX_CPU_ACCELERATION STREQUAL "AVX_256" AND GMX_DOUBLE)
+if("${GMX_CPU_ACCELERATION}" STREQUAL "AVX_256" AND GMX_DOUBLE)
     file(GLOB NONBONDED_AVX_256_DOUBLE_SOURCES nb_kernel_avx_256_double/*.c)
 endif()
 
+if("${GMX_CPU_ACCELERATION}" STREQUAL "Sparc64_HPC_ACE" AND GMX_DOUBLE)
+    file(GLOB NONBONDED_SPARC64_HPC_ACE_DOUBLE_SOURCES nb_kernel_sparc64_hpc_ace_double/*.c)
+endif()
+
+
 # These sources will be used in the parent directory's CMakeLists.txt
-set(NONBONDED_SOURCES ${NONBONDED_SOURCES} ${NONBONDED_SSE2_SINGLE_SOURCES} ${NONBONDED_SSE4_1_SINGLE_SOURCES} ${NONBONDED_AVX_128_FMA_SINGLE_SOURCES} ${NONBONDED_AVX_256_SINGLE_SOURCES} ${NONBONDED_SSE2_DOUBLE_SOURCES} ${NONBONDED_SSE4_1_DOUBLE_SOURCES} ${NONBONDED_AVX_128_FMA_DOUBLE_SOURCES} ${NONBONDED_AVX_256_DOUBLE_SOURCES} PARENT_SCOPE)
+set(NONBONDED_SOURCES ${NONBONDED_SOURCES} ${NONBONDED_SSE2_SINGLE_SOURCES} ${NONBONDED_SSE4_1_SINGLE_SOURCES} ${NONBONDED_AVX_128_FMA_SINGLE_SOURCES} ${NONBONDED_AVX_256_SINGLE_SOURCES} ${NONBONDED_SSE2_DOUBLE_SOURCES} ${NONBONDED_SSE4_1_DOUBLE_SOURCES} ${NONBONDED_AVX_128_FMA_DOUBLE_SOURCES} ${NONBONDED_AVX_256_DOUBLE_SOURCES} ${NONBONDED_SPARC64_HPC_ACE_DOUBLE_SOURCES} PARENT_SCOPE)
 
 
 
diff --git a/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/kernelutil_sparc64_hpc_ace_double.h b/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/kernelutil_sparc64_hpc_ace_double.h
new file mode 100644 (file)
index 0000000..dfd3839
--- /dev/null
@@ -0,0 +1,945 @@
+/*
+ *                This source code is part of
+ *
+ *                 G   R   O   M   A   C   S
+ *
+ * Copyright (c) 2011-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under 
+ * the terms of the GNU Lesser General Public License as published by the Free 
+ * Software Foundation; either version 2 of the License, or (at your option) any 
+ * later version.
+ * As a special exception, you may use this file as part of a free software
+ * library without restriction.  Specifically, if other files instantiate
+ * templates or use macros or inline functions from this file, or you compile
+ * this file and link it with other files to produce an executable, this
+ * file does not by itself cause the resulting executable to be covered by
+ * the GNU Lesser General Public License.
+ *
+ * In plain-speak: do not worry about classes/macros/templates either - only
+ * changes to the library have to be LGPL, not an application linking with it.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website!
+ */
+#ifndef _kernelutil_sparc64_hpc_ace_double_h_
+#define _kernelutil_sparc64_hpc_ace_double_h_
+
+/* Fujitsu header borrows the name from SSE2, since some instructions have aliases */
+#include "emmintrin.h"
+
+#define GMX_FJSP_SHUFFLE2(x,y) (((x)<<1) | (y))
+
+#define GMX_FJSP_TRANSPOSE2_V2R8(row0, row1) {           \
+    _fjsp_v2r8 __gmx_t1 = row0;                          \
+    row0           = _fjsp_unpacklo_v2r8(row0,row1);     \
+    row1           = _fjsp_unpackhi_v2r8(__gmx_t1,row1); \
+}
+
+
+/* Debug helper: print the label s followed by the two elements of a, in the
+ * order written by _fjsp_storel_v2r8 and then _fjsp_storeh_v2r8.
+ */
+static void
+gmx_fjsp_print_v2r8(const char *s, _fjsp_v2r8 a)
+{
+    double elem[2];
+
+    _fjsp_storel_v2r8(&elem[0],a);
+    _fjsp_storeh_v2r8(&elem[1],a);
+    printf("%s: %g %g\n",s,elem[0],elem[1]);
+}
+
+
+/* Build a vector by passing d as both arguments of _fjsp_set_v2r8. */
+static _fjsp_v2r8
+gmx_fjsp_set1_v2r8(double d)
+{
+    const _fjsp_v2r8 both = _fjsp_set_v2r8(d,d);
+
+    return both;
+}
+
+/* Read the double at ptr and broadcast it with gmx_fjsp_set1_v2r8. */
+static _fjsp_v2r8
+gmx_fjsp_load1_v2r8(const double * gmx_restrict ptr)
+{
+    const double value = *ptr;
+
+    return gmx_fjsp_set1_v2r8(value);
+}
+
+
+/* Return 1 if the result of _fjsp_cmplt_v2r8(a,b) has any bit set in either
+ * element, 0 otherwise (presumably "any element of a is less than b" - the
+ * exact mask format of the intrinsic is not visible here).
+ */
+static int
+gmx_fjsp_any_lt_v2r8(_fjsp_v2r8 a, _fjsp_v2r8 b)
+{
+    /* Used to inspect the raw bits of the stored double as an integer */
+    union
+    {
+        double           d;
+        long long int    i;
+    }
+    conv;
+    
+    a = _fjsp_cmplt_v2r8(a,b);
+    /* OR the unpackhi(a,a) copy into a so one stored element reflects both */
+    a = _fjsp_or_v2r8(a, _fjsp_unpackhi_v2r8(a,a));
+    _fjsp_storel_v2r8(&(conv.d),a);
+    return (conv.i != 0);
+}
+
+/* 1.0/sqrt(x)
+ *
+ * Starts from the _fjsp_rsqrta_v2r8 estimate and refines it with steps of the
+ * form lu = 0.5*lu*nmsub(lu*lu,x,3.0) (presumably 3.0 - lu*lu*x, i.e. a
+ * Newton-Raphson iteration). Three steps are done by default, two when
+ * GMX_RELAXED_DOUBLE_PRECISION is defined.
+ */
+static gmx_inline _fjsp_v2r8
+gmx_fjsp_invsqrt_v2r8(_fjsp_v2r8 x)
+{
+    const _fjsp_v2r8 half  = gmx_fjsp_set1_v2r8(0.5);
+    const _fjsp_v2r8 three = gmx_fjsp_set1_v2r8(3.0);
+    _fjsp_v2r8 lu = _fjsp_rsqrta_v2r8(x);
+    
+    lu = _fjsp_mul_v2r8(_fjsp_mul_v2r8(half,lu),_fjsp_nmsub_v2r8(_fjsp_mul_v2r8(lu,lu),x,three));
+    /* The HPC-ACE instruction set is only available in double precision, while
+     * single precision is typically sufficient for Gromacs. If you define 
+     * "GMX_RELAXED_DOUBLE_PRECISION" during compile, we stick to two Newton-Raphson 
+     * iterations and accept 32bits of accuracy in 1.0/sqrt(x) and 1.0/x, rather than full 
+     * double precision (53 bits). This is still clearly higher than single precision (24 bits).
+     */
+#ifndef GMX_RELAXED_DOUBLE_PRECISION
+    lu = _fjsp_mul_v2r8(_fjsp_mul_v2r8(half,lu),_fjsp_nmsub_v2r8(_fjsp_mul_v2r8(lu,lu),x,three));
+#endif
+    /* Final refinement step, always performed */
+    return _fjsp_mul_v2r8(_fjsp_mul_v2r8(half,lu),_fjsp_nmsub_v2r8(_fjsp_mul_v2r8(lu,lu),x,three));
+}
+
+
+/* 1.0/x
+ *
+ * Starts from the _fjsp_rcpa_v2r8 estimate and refines it with steps of the
+ * form lu = lu*nmsub(lu,x,2.0) (presumably 2.0 - lu*x, i.e. a Newton-Raphson
+ * iteration). Three steps are done by default, two when
+ * GMX_RELAXED_DOUBLE_PRECISION is defined.
+ */
+static gmx_inline _fjsp_v2r8
+gmx_fjsp_inv_v2r8(_fjsp_v2r8 x)
+{
+    const _fjsp_v2r8 two  = gmx_fjsp_set1_v2r8(2.0);
+    /* Use the native Fujitsu vector type, like every other routine in this file */
+    _fjsp_v2r8 lu = _fjsp_rcpa_v2r8(x);
+    
+    /* First refinement step, always performed */
+    lu         = _fjsp_mul_v2r8(lu,_fjsp_nmsub_v2r8(lu,x,two));
+    /* The HPC-ACE instruction set is only available in double precision, while
+     * single precision is typically sufficient for Gromacs. If you define
+     * "GMX_RELAXED_DOUBLE_PRECISION" during compile, we stick to two Newton-Raphson
+     * iterations and accept 32bits of accuracy in 1.0/sqrt(x) and 1.0/x, rather than full
+     * double precision (53 bits). This is still clearly higher than single precision (24 bits).
+     */
+#ifndef GMX_RELAXED_DOUBLE_PRECISION
+    lu         = _fjsp_mul_v2r8(lu,_fjsp_nmsub_v2r8(lu,x,two));
+#endif
+    /* Final refinement step, always performed */
+    return _fjsp_mul_v2r8(lu,_fjsp_nmsub_v2r8(lu,x,two));
+}
+
+
+/* Combine dx, dy and dz into dx*dx + dy*dy + dz*dz per element, assuming
+ * _fjsp_madd_v2r8(a,b,c) computes a*b+c (TODO confirm against the intrinsic
+ * reference).
+ */
+static gmx_inline _fjsp_v2r8
+gmx_fjsp_calc_rsq_v2r8(_fjsp_v2r8 dx, _fjsp_v2r8 dy, _fjsp_v2r8 dz)
+{
+    _fjsp_v2r8 rsq;
+
+    rsq = _fjsp_mul_v2r8(dz,dz);
+    rsq = _fjsp_madd_v2r8(dy,dy,rsq);
+    rsq = _fjsp_madd_v2r8(dx,dx,rsq);
+    return rsq;
+}
+
+/* Normal sum of four v2r8 registers */
+#define gmx_fjsp_sum4_v2r8(t0,t1,t2,t3)  _fjsp_add_v2r8(_fjsp_add_v2r8(t0,t1),_fjsp_add_v2r8(t2,t3))
+
+
+
+
+
+/* Load one double from ptrA and one from ptrB (each via _fjsp_loadl_v2r8 on a
+ * zeroed register) and combine them with _fjsp_unpacklo_v2r8(A,B).
+ */
+static _fjsp_v2r8
+gmx_fjsp_load_2real_swizzle_v2r8(const double * gmx_restrict ptrA,
+                                 const double * gmx_restrict ptrB)
+{
+    const _fjsp_v2r8 valA = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),ptrA);
+    const _fjsp_v2r8 valB = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),ptrB);
+
+    return _fjsp_unpacklo_v2r8(valA,valB);
+}
+
+/* Load the double at ptrA into a zeroed register with _fjsp_loadl_v2r8. */
+static _fjsp_v2r8
+gmx_fjsp_load_1real_v2r8(const double * gmx_restrict ptrA)
+{
+    const _fjsp_v2r8 zero = _fjsp_setzero_v2r8();
+
+    return _fjsp_loadl_v2r8(zero,ptrA);
+}
+
+
+/* Store the two elements of xmm1 to separate locations: the element written
+ * by _fjsp_storel_v2r8 goes to ptrA, and the same element of
+ * _fjsp_unpackhi_v2r8(xmm1,xmm1) goes to ptrB.
+ */
+static void
+gmx_fjsp_store_2real_swizzle_v2r8(double * gmx_restrict ptrA,
+                                double * gmx_restrict ptrB,
+                                _fjsp_v2r8 xmm1)
+{
+    const _fjsp_v2r8 upper = _fjsp_unpackhi_v2r8(xmm1,xmm1);
+
+    _fjsp_storel_v2r8(ptrA,xmm1);
+    _fjsp_storel_v2r8(ptrB,upper);
+}
+
+/* Store a single double to ptrA: thin wrapper around _fjsp_storel_v2r8. */
+static void
+gmx_fjsp_store_1real_v2r8(double * gmx_restrict ptrA, _fjsp_v2r8 xmm1)
+{
+    _fjsp_storel_v2r8(ptrA,xmm1);
+}
+
+
+/* Like gmx_fjsp_store_2real_swizzle_v2r8, but adds to the values already in
+ * memory: *ptrA is incremented by one element of xmm1 and *ptrB by the other.
+ * Both memory locations are read before either is written.
+ */
+static void
+gmx_fjsp_increment_2real_swizzle_v2r8(double * gmx_restrict ptrA,
+                                    double * gmx_restrict ptrB, _fjsp_v2r8 xmm1)
+{
+    _fjsp_v2r8 sumA,sumB;
+
+    sumB = _fjsp_unpackhi_v2r8(xmm1,xmm1);
+    sumA = _fjsp_add_v2r8(xmm1,_fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),ptrA));
+    sumB = _fjsp_add_v2r8(sumB,_fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),ptrB));
+    _fjsp_storel_v2r8(ptrA,sumA);
+    _fjsp_storel_v2r8(ptrB,sumB);
+}
+
+/* Add xmm1 to the double stored at ptrA and write the result back; only the
+ * element written by _fjsp_storel_v2r8 reaches memory.
+ */
+static void
+gmx_fjsp_increment_1real_v2r8(double * gmx_restrict ptrA, _fjsp_v2r8 xmm1)
+{
+    const _fjsp_v2r8 current = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),ptrA);
+
+    _fjsp_storel_v2r8(ptrA,_fjsp_add_v2r8(current,xmm1));
+}
+
+
+
+/* Load a (c6,c12) pair from each of p1 and p2 and regroup them:
+ * *c6 = unpacklo(pair1,pair2) and *c12 = unpackhi(pair1,pair2), so that the
+ * first value of each pair presumably ends up in *c6 and the second in *c12.
+ */
+static gmx_inline void
+gmx_fjsp_load_2pair_swizzle_v2r8(const double * gmx_restrict p1,
+                             const double * gmx_restrict p2,
+                             _fjsp_v2r8 * gmx_restrict c6,
+                             _fjsp_v2r8 * gmx_restrict c12)
+{
+    _fjsp_v2r8 t1,t2;
+    
+    /* The c6/c12 array should be aligned */
+    t1   = _fjsp_load_v2r8(p1);
+    t2   = _fjsp_load_v2r8(p2);
+    *c6  = _fjsp_unpacklo_v2r8(t1,t2);
+    *c12 = _fjsp_unpackhi_v2r8(t1,t2);
+}
+
+/* Single-pair variant: p1[0] is loaded into *c6 and p1[1] into *c12, each via
+ * _fjsp_loadl_v2r8 on a zeroed register.
+ */
+static gmx_inline void
+gmx_fjsp_load_1pair_swizzle_v2r8(const double * gmx_restrict p1,
+                             _fjsp_v2r8 * gmx_restrict c6,
+                             _fjsp_v2r8 * gmx_restrict c12)
+{
+    const double * const p_c6  = p1;
+    const double * const p_c12 = p1+1;
+
+    *c6     = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),p_c6);
+    *c12    = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),p_c12);
+}
+
+
+/* Load three doubles from xyz and three from xyz_shift, add them, and
+ * broadcast each sum: *x1, *y1 and *z1 each get one coordinate replicated by
+ * a _fjsp_shuffle_v2r8 of the sum with itself.
+ * xyz and xyz_shift are read with _fjsp_load_v2r8 for the first two values.
+ */
+static gmx_inline void
+gmx_fjsp_load_shift_and_1rvec_broadcast_v2r8(const double * gmx_restrict xyz_shift,
+                                         const double * gmx_restrict xyz,
+                                         _fjsp_v2r8 * gmx_restrict x1,
+                                         _fjsp_v2r8 * gmx_restrict y1,
+                                         _fjsp_v2r8 * gmx_restrict z1)
+{
+    _fjsp_v2r8 mem_xy,mem_z,mem_sxy,mem_sz;
+    
+    /* x,y as one vector, z on its own, for both the position and the shift */
+    mem_xy  = _fjsp_load_v2r8(xyz);
+    mem_z   = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),xyz+2);
+    mem_sxy = _fjsp_load_v2r8(xyz_shift);
+    mem_sz  = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),xyz_shift+2);
+    
+    mem_xy  = _fjsp_add_v2r8(mem_xy,mem_sxy);
+    mem_z   = _fjsp_add_v2r8(mem_z,mem_sz);
+    
+    /* Replicate each shifted coordinate */
+    *x1  = _fjsp_shuffle_v2r8(mem_xy,mem_xy,GMX_FJSP_SHUFFLE2(0,0));
+    *y1  = _fjsp_shuffle_v2r8(mem_xy,mem_xy,GMX_FJSP_SHUFFLE2(1,1));
+    *z1  = _fjsp_shuffle_v2r8(mem_z,mem_z,GMX_FJSP_SHUFFLE2(0,0));
+}
+
+
+/* Three-rvec version of the shift-and-broadcast load: reads nine consecutive
+ * doubles from xyz, adds the three-component shift from xyz_shift to each
+ * rvec, and broadcasts every resulting coordinate into its own output.
+ * Because the nine doubles are loaded two at a time, the shift is rearranged
+ * into (sxy), (szx) and (syz) vectors so each pair lines up with the right
+ * shift components (assuming SSE2-like shuffle semantics - TODO confirm).
+ */
+static gmx_inline void
+gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(const double * gmx_restrict xyz_shift,
+                                         const double * gmx_restrict xyz,
+                                         _fjsp_v2r8 * gmx_restrict x1, _fjsp_v2r8 * gmx_restrict y1, _fjsp_v2r8 * gmx_restrict z1,
+                                         _fjsp_v2r8 * gmx_restrict x2, _fjsp_v2r8 * gmx_restrict y2, _fjsp_v2r8 * gmx_restrict z2,
+                                         _fjsp_v2r8 * gmx_restrict x3, _fjsp_v2r8 * gmx_restrict y3, _fjsp_v2r8 * gmx_restrict z3)
+{
+    _fjsp_v2r8 t1,t2,t3,t4,t5,sxy,sz,szx,syz;
+    
+    /* Pairs: (x1,y1) (z1,x2) (y2,z2) (x3,y3) and a lone z3 */
+    t1  = _fjsp_load_v2r8(xyz);
+    t2  = _fjsp_load_v2r8(xyz+2);
+    t3  = _fjsp_load_v2r8(xyz+4);
+    t4  = _fjsp_load_v2r8(xyz+6);
+    t5  = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),xyz+8);
+    
+    /* Build the shift permutations matching the pairs above */
+    sxy = _fjsp_load_v2r8(xyz_shift);
+    sz  = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),xyz_shift+2);
+    szx = _fjsp_shuffle_v2r8(sz,sxy,GMX_FJSP_SHUFFLE2(0,0));
+    syz = _fjsp_shuffle_v2r8(sxy,sz,GMX_FJSP_SHUFFLE2(0,1));
+    
+    t1  = _fjsp_add_v2r8(t1,sxy);
+    t2  = _fjsp_add_v2r8(t2,szx);
+    t3  = _fjsp_add_v2r8(t3,syz);
+    t4  = _fjsp_add_v2r8(t4,sxy);
+    t5  = _fjsp_add_v2r8(t5,sz);
+    
+    /* Broadcast each shifted coordinate into its own register */
+    *x1  = _fjsp_shuffle_v2r8(t1,t1,GMX_FJSP_SHUFFLE2(0,0));
+    *y1  = _fjsp_shuffle_v2r8(t1,t1,GMX_FJSP_SHUFFLE2(1,1));
+    *z1  = _fjsp_shuffle_v2r8(t2,t2,GMX_FJSP_SHUFFLE2(0,0));
+    *x2  = _fjsp_shuffle_v2r8(t2,t2,GMX_FJSP_SHUFFLE2(1,1));
+    *y2  = _fjsp_shuffle_v2r8(t3,t3,GMX_FJSP_SHUFFLE2(0,0));
+    *z2  = _fjsp_shuffle_v2r8(t3,t3,GMX_FJSP_SHUFFLE2(1,1));
+    *x3  = _fjsp_shuffle_v2r8(t4,t4,GMX_FJSP_SHUFFLE2(0,0));
+    *y3  = _fjsp_shuffle_v2r8(t4,t4,GMX_FJSP_SHUFFLE2(1,1));
+    *z3  = _fjsp_shuffle_v2r8(t5,t5,GMX_FJSP_SHUFFLE2(0,0));
+}
+
+
+/* Four-rvec version of the shift-and-broadcast load: reads twelve consecutive
+ * doubles from xyz, adds the three-component shift from xyz_shift to each
+ * rvec, and broadcasts every resulting coordinate into its own output.
+ * The shift is rearranged into (sxy), (szx) and (syz) vectors so that each
+ * two-double load lines up with the right shift components (assuming
+ * SSE2-like shuffle semantics - TODO confirm).
+ */
+static gmx_inline void
+gmx_fjsp_load_shift_and_4rvec_broadcast_v2r8(const double * gmx_restrict xyz_shift,
+                                         const double * gmx_restrict xyz,
+                                         _fjsp_v2r8 * gmx_restrict x1, _fjsp_v2r8 * gmx_restrict y1, _fjsp_v2r8 * gmx_restrict z1,
+                                         _fjsp_v2r8 * gmx_restrict x2, _fjsp_v2r8 * gmx_restrict y2, _fjsp_v2r8 * gmx_restrict z2,
+                                         _fjsp_v2r8 * gmx_restrict x3, _fjsp_v2r8 * gmx_restrict y3, _fjsp_v2r8 * gmx_restrict z3,
+                                         _fjsp_v2r8 * gmx_restrict x4, _fjsp_v2r8 * gmx_restrict y4, _fjsp_v2r8 * gmx_restrict z4)
+{
+    _fjsp_v2r8 t1,t2,t3,t4,t5,t6,sxy,sz,szx,syz;
+    
+    /* Pairs: (x1,y1) (z1,x2) (y2,z2) (x3,y3) (z3,x4) (y4,z4) */
+    t1  = _fjsp_load_v2r8(xyz);
+    t2  = _fjsp_load_v2r8(xyz+2);
+    t3  = _fjsp_load_v2r8(xyz+4);
+    t4  = _fjsp_load_v2r8(xyz+6);
+    t5  = _fjsp_load_v2r8(xyz+8);
+    t6  = _fjsp_load_v2r8(xyz+10);
+    
+    /* Build the shift permutations matching the pairs above */
+    sxy = _fjsp_load_v2r8(xyz_shift);
+    sz  = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),xyz_shift+2);
+    szx = _fjsp_shuffle_v2r8(sz,sxy,GMX_FJSP_SHUFFLE2(0,0));
+    syz = _fjsp_shuffle_v2r8(sxy,sz,GMX_FJSP_SHUFFLE2(0,1));
+    
+    t1  = _fjsp_add_v2r8(t1,sxy);
+    t2  = _fjsp_add_v2r8(t2,szx);
+    t3  = _fjsp_add_v2r8(t3,syz);
+    t4  = _fjsp_add_v2r8(t4,sxy);
+    t5  = _fjsp_add_v2r8(t5,szx);
+    t6  = _fjsp_add_v2r8(t6,syz);
+    
+    /* Broadcast each shifted coordinate into its own register */
+    *x1  = _fjsp_shuffle_v2r8(t1,t1,GMX_FJSP_SHUFFLE2(0,0));
+    *y1  = _fjsp_shuffle_v2r8(t1,t1,GMX_FJSP_SHUFFLE2(1,1));
+    *z1  = _fjsp_shuffle_v2r8(t2,t2,GMX_FJSP_SHUFFLE2(0,0));
+    *x2  = _fjsp_shuffle_v2r8(t2,t2,GMX_FJSP_SHUFFLE2(1,1));
+    *y2  = _fjsp_shuffle_v2r8(t3,t3,GMX_FJSP_SHUFFLE2(0,0));
+    *z2  = _fjsp_shuffle_v2r8(t3,t3,GMX_FJSP_SHUFFLE2(1,1));
+    *x3  = _fjsp_shuffle_v2r8(t4,t4,GMX_FJSP_SHUFFLE2(0,0));
+    *y3  = _fjsp_shuffle_v2r8(t4,t4,GMX_FJSP_SHUFFLE2(1,1));
+    *z3  = _fjsp_shuffle_v2r8(t5,t5,GMX_FJSP_SHUFFLE2(0,0));
+    *x4  = _fjsp_shuffle_v2r8(t5,t5,GMX_FJSP_SHUFFLE2(1,1));
+    *y4  = _fjsp_shuffle_v2r8(t6,t6,GMX_FJSP_SHUFFLE2(0,0));
+    *z4  = _fjsp_shuffle_v2r8(t6,t6,GMX_FJSP_SHUFFLE2(1,1));
+}
+
+
+
+/* Load the three doubles p1[0..2] into *x, *y and *z, one value per register,
+ * each via _fjsp_loadl_v2r8 on a zeroed register.
+ */
+static gmx_inline void
+gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(const double * gmx_restrict p1,
+                                  _fjsp_v2r8 * gmx_restrict x, _fjsp_v2r8 * gmx_restrict y, _fjsp_v2r8 * gmx_restrict z)
+{
+    const _fjsp_v2r8 zero = _fjsp_setzero_v2r8();
+
+    *x = _fjsp_loadl_v2r8(zero,p1);
+    *y = _fjsp_loadl_v2r8(zero,p1+1);
+    *z = _fjsp_loadl_v2r8(zero,p1+2);
+}
+
+/* Load the nine doubles p1[0..8] (three rvecs) into nine separate registers,
+ * one value per register, each via _fjsp_loadl_v2r8 on a zeroed register.
+ */
+static gmx_inline void
+gmx_fjsp_load_3rvec_1ptr_swizzle_v2r8(const double * gmx_restrict p1,
+                                  _fjsp_v2r8 * gmx_restrict x1, _fjsp_v2r8 * gmx_restrict y1, _fjsp_v2r8 * gmx_restrict z1,
+                                  _fjsp_v2r8 * gmx_restrict x2, _fjsp_v2r8 * gmx_restrict y2, _fjsp_v2r8 * gmx_restrict z2,
+                                  _fjsp_v2r8 * gmx_restrict x3, _fjsp_v2r8 * gmx_restrict y3, _fjsp_v2r8 * gmx_restrict z3)
+{
+    const _fjsp_v2r8 zero = _fjsp_setzero_v2r8();
+
+    /* First rvec */
+    *x1 = _fjsp_loadl_v2r8(zero,p1);
+    *y1 = _fjsp_loadl_v2r8(zero,p1+1);
+    *z1 = _fjsp_loadl_v2r8(zero,p1+2);
+    /* Second rvec */
+    *x2 = _fjsp_loadl_v2r8(zero,p1+3);
+    *y2 = _fjsp_loadl_v2r8(zero,p1+4);
+    *z2 = _fjsp_loadl_v2r8(zero,p1+5);
+    /* Third rvec */
+    *x3 = _fjsp_loadl_v2r8(zero,p1+6);
+    *y3 = _fjsp_loadl_v2r8(zero,p1+7);
+    *z3 = _fjsp_loadl_v2r8(zero,p1+8);
+}
+
+/* Load the twelve doubles p1[0..11] (four rvecs) into twelve separate
+ * registers, one value per register, each via _fjsp_loadl_v2r8 on a zeroed
+ * register.
+ */
+static gmx_inline void
+gmx_fjsp_load_4rvec_1ptr_swizzle_v2r8(const double * gmx_restrict p1,
+                                  _fjsp_v2r8 * gmx_restrict x1, _fjsp_v2r8 * gmx_restrict y1, _fjsp_v2r8 * gmx_restrict z1,
+                                  _fjsp_v2r8 * gmx_restrict x2, _fjsp_v2r8 * gmx_restrict y2, _fjsp_v2r8 * gmx_restrict z2,
+                                  _fjsp_v2r8 * gmx_restrict x3, _fjsp_v2r8 * gmx_restrict y3, _fjsp_v2r8 * gmx_restrict z3,
+                                  _fjsp_v2r8 * gmx_restrict x4, _fjsp_v2r8 * gmx_restrict y4, _fjsp_v2r8 * gmx_restrict z4)
+{
+    const _fjsp_v2r8 zero = _fjsp_setzero_v2r8();
+
+    /* First rvec */
+    *x1 = _fjsp_loadl_v2r8(zero,p1);
+    *y1 = _fjsp_loadl_v2r8(zero,p1+1);
+    *z1 = _fjsp_loadl_v2r8(zero,p1+2);
+    /* Second rvec */
+    *x2 = _fjsp_loadl_v2r8(zero,p1+3);
+    *y2 = _fjsp_loadl_v2r8(zero,p1+4);
+    *z2 = _fjsp_loadl_v2r8(zero,p1+5);
+    /* Third rvec */
+    *x3 = _fjsp_loadl_v2r8(zero,p1+6);
+    *y3 = _fjsp_loadl_v2r8(zero,p1+7);
+    *z3 = _fjsp_loadl_v2r8(zero,p1+8);
+    /* Fourth rvec */
+    *x4 = _fjsp_loadl_v2r8(zero,p1+9);
+    *y4 = _fjsp_loadl_v2r8(zero,p1+10);
+    *z4 = _fjsp_loadl_v2r8(zero,p1+11);
+}
+
+
+/* Load one rvec from ptrA and one from ptrB and regroup them by coordinate:
+ * *x1, *y1 and *z1 each combine the matching coordinate of A and of B.
+ * The first two doubles of each rvec are read with _fjsp_load_v2r8.
+ */
+static gmx_inline void
+gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(const double * gmx_restrict ptrA,
+                                  const double * gmx_restrict ptrB,
+                                  _fjsp_v2r8 * gmx_restrict x1, _fjsp_v2r8 * gmx_restrict y1, _fjsp_v2r8 * gmx_restrict z1)
+{
+    _fjsp_v2r8 xyA,xyB,zA,zB;
+
+    xyA = _fjsp_load_v2r8(ptrA);
+    xyB = _fjsp_load_v2r8(ptrB);
+    zA  = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),ptrA+2);
+    zB  = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),ptrB+2);
+
+    /* 2x2 transpose of the (x,y) pairs, written out explicitly */
+    *x1 = _fjsp_unpacklo_v2r8(xyA,xyB);
+    *y1 = _fjsp_unpackhi_v2r8(xyA,xyB);
+    *z1 = _fjsp_unpacklo_v2r8(zA,zB);
+}
+
+/* Load three rvecs (nine doubles) from each of ptrA and ptrB and regroup them
+ * by coordinate: every output combines the matching coordinate of A and of B.
+ * All loads are issued before any output is written.
+ */
+static gmx_inline void
+gmx_fjsp_load_3rvec_2ptr_swizzle_v2r8(const double * gmx_restrict ptrA, const double * gmx_restrict ptrB,
+                                  _fjsp_v2r8 * gmx_restrict x1, _fjsp_v2r8 * gmx_restrict y1, _fjsp_v2r8 * gmx_restrict z1,
+                                  _fjsp_v2r8 * gmx_restrict x2, _fjsp_v2r8 * gmx_restrict y2, _fjsp_v2r8 * gmx_restrict z2,
+                                  _fjsp_v2r8 * gmx_restrict x3, _fjsp_v2r8 * gmx_restrict y3, _fjsp_v2r8 * gmx_restrict z3)
+{
+    _fjsp_v2r8 a01,b01,a23,b23,a45,b45,a67,b67,a8,b8;
+
+    /* Two doubles at a time from each pointer; the ninth one on its own */
+    a01 = _fjsp_load_v2r8(ptrA);
+    b01 = _fjsp_load_v2r8(ptrB);
+    a23 = _fjsp_load_v2r8(ptrA+2);
+    b23 = _fjsp_load_v2r8(ptrB+2);
+    a45 = _fjsp_load_v2r8(ptrA+4);
+    b45 = _fjsp_load_v2r8(ptrB+4);
+    a67 = _fjsp_load_v2r8(ptrA+6);
+    b67 = _fjsp_load_v2r8(ptrB+6);
+    a8  = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),ptrA+8);
+    b8  = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),ptrB+8);
+
+    /* 2x2 transposes, written out explicitly */
+    *x1 = _fjsp_unpacklo_v2r8(a01,b01);
+    *y1 = _fjsp_unpackhi_v2r8(a01,b01);
+    *z1 = _fjsp_unpacklo_v2r8(a23,b23);
+    *x2 = _fjsp_unpackhi_v2r8(a23,b23);
+    *y2 = _fjsp_unpacklo_v2r8(a45,b45);
+    *z2 = _fjsp_unpackhi_v2r8(a45,b45);
+    *x3 = _fjsp_unpacklo_v2r8(a67,b67);
+    *y3 = _fjsp_unpackhi_v2r8(a67,b67);
+    *z3 = _fjsp_unpacklo_v2r8(a8,b8);
+}
+
+
+/* Load four rvecs (twelve doubles) from each of ptrA and ptrB and regroup them
+ * by coordinate using 2x2 transposes: every output combines the matching
+ * coordinate of A and of B. The work is done in two halves of six doubles so
+ * the six temporaries can be reused.
+ */
+static gmx_inline void
+gmx_fjsp_load_4rvec_2ptr_swizzle_v2r8(const double * gmx_restrict ptrA, const double * gmx_restrict ptrB,
+                                  _fjsp_v2r8 * gmx_restrict x1, _fjsp_v2r8 * gmx_restrict y1, _fjsp_v2r8 * gmx_restrict z1,
+                                  _fjsp_v2r8 * gmx_restrict x2, _fjsp_v2r8 * gmx_restrict y2, _fjsp_v2r8 * gmx_restrict z2,
+                                  _fjsp_v2r8 * gmx_restrict x3, _fjsp_v2r8 * gmx_restrict y3, _fjsp_v2r8 * gmx_restrict z3,
+                                  _fjsp_v2r8 * gmx_restrict x4, _fjsp_v2r8 * gmx_restrict y4, _fjsp_v2r8 * gmx_restrict z4)
+{
+    _fjsp_v2r8 t1,t2,t3,t4,t5,t6;
+    /* First half: doubles 0..5 of each pointer -> x1,y1,z1,x2,y2,z2 */
+    t1           = _fjsp_load_v2r8(ptrA);
+    t2           = _fjsp_load_v2r8(ptrB);
+    t3           = _fjsp_load_v2r8(ptrA+2);
+    t4           = _fjsp_load_v2r8(ptrB+2);
+    t5           = _fjsp_load_v2r8(ptrA+4);
+    t6           = _fjsp_load_v2r8(ptrB+4);
+    GMX_FJSP_TRANSPOSE2_V2R8(t1,t2);
+    GMX_FJSP_TRANSPOSE2_V2R8(t3,t4);
+    GMX_FJSP_TRANSPOSE2_V2R8(t5,t6);
+    *x1          = t1;
+    *y1          = t2;
+    *z1          = t3;
+    *x2          = t4;
+    *y2          = t5;
+    *z2          = t6;
+    /* Second half: doubles 6..11 of each pointer -> x3,y3,z3,x4,y4,z4 */
+    t1           = _fjsp_load_v2r8(ptrA+6);
+    t2           = _fjsp_load_v2r8(ptrB+6);
+    t3           = _fjsp_load_v2r8(ptrA+8);
+    t4           = _fjsp_load_v2r8(ptrB+8);
+    t5           = _fjsp_load_v2r8(ptrA+10);
+    t6           = _fjsp_load_v2r8(ptrB+10);
+    GMX_FJSP_TRANSPOSE2_V2R8(t1,t2);
+    GMX_FJSP_TRANSPOSE2_V2R8(t3,t4);
+    GMX_FJSP_TRANSPOSE2_V2R8(t5,t6);
+    *x3          = t1;
+    *y3          = t2;
+    *z3          = t3;
+    *x4          = t4;
+    *y4          = t5;
+    *z4          = t6;
+}
+
+
+/* Subtract one rvec from the three doubles at ptrA[0..2]: x1, y1 and z1 are
+ * subtracted from ptrA[0], ptrA[1] and ptrA[2] respectively. Only the element
+ * written by _fjsp_storel_v2r8 of each difference reaches memory.
+ */
+static void
+gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(double * gmx_restrict ptrA,
+                                       _fjsp_v2r8 x1, _fjsp_v2r8 y1, _fjsp_v2r8 z1)
+{
+    _fjsp_v2r8 memx,memy,memz;
+
+    /* Read all three components before writing any of them back */
+    memx = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),ptrA);
+    memy = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),ptrA+1);
+    memz = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),ptrA+2);
+
+    memx = _fjsp_sub_v2r8(memx,x1);
+    memy = _fjsp_sub_v2r8(memy,y1);
+    memz = _fjsp_sub_v2r8(memz,z1);
+
+    _fjsp_storel_v2r8(ptrA,memx);
+    _fjsp_storel_v2r8(ptrA+1,memy);
+    _fjsp_storel_v2r8(ptrA+2,memz);
+}
+
+/* Fused variant of the 1rvec/1ptr decrement: each of ptrA[0..2] is updated
+ * with _fjsp_nmsub_v2r8(fscal,d,mem), presumably mem - fscal*d (TODO confirm
+ * against the intrinsic reference), for d = dx1, dy1, dz1.
+ */
+static void
+gmx_fjsp_decrement_fma_1rvec_1ptr_swizzle_v2r8(double * gmx_restrict ptrA, _fjsp_v2r8 fscal,
+                                          _fjsp_v2r8 dx1, _fjsp_v2r8 dy1, _fjsp_v2r8 dz1)
+{
+    _fjsp_v2r8 memx,memy,memz;
+
+    /* Read all three components before writing any of them back */
+    memx = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),ptrA);
+    memy = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),ptrA+1);
+    memz = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),ptrA+2);
+
+    memx = _fjsp_nmsub_v2r8(fscal,dx1,memx);
+    memy = _fjsp_nmsub_v2r8(fscal,dy1,memy);
+    memz = _fjsp_nmsub_v2r8(fscal,dz1,memz);
+
+    _fjsp_storel_v2r8(ptrA,memx);
+    _fjsp_storel_v2r8(ptrA+1,memy);
+    _fjsp_storel_v2r8(ptrA+2,memz);
+}
+
+
+/* Subtract three rvecs from the nine doubles at ptrA[0..8]. The nine inputs
+ * are first packed pairwise with _fjsp_unpacklo_v2r8 so they match the
+ * two-doubles-at-a-time layout used for the loads; z3 is handled on its own.
+ * The by-value parameters x1, z1, y2 and x3 are reused as scratch for the
+ * packed pairs.
+ */
+static void
+gmx_fjsp_decrement_3rvec_1ptr_swizzle_v2r8(double * gmx_restrict ptrA,
+                                       _fjsp_v2r8 x1, _fjsp_v2r8 y1, _fjsp_v2r8 z1,
+                                       _fjsp_v2r8 x2, _fjsp_v2r8 y2, _fjsp_v2r8 z2,
+                                       _fjsp_v2r8 x3, _fjsp_v2r8 y3, _fjsp_v2r8 z3) 
+{
+    _fjsp_v2r8 t1,t2,t3,t4,t5;
+    
+    t1          = _fjsp_load_v2r8(ptrA);
+    t2          = _fjsp_load_v2r8(ptrA+2);
+    t3          = _fjsp_load_v2r8(ptrA+4);
+    t4          = _fjsp_load_v2r8(ptrA+6);
+    t5          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),ptrA+8);
+    
+    /* Pack as (x1,y1) (z1,x2) (y2,z2) (x3,y3) */
+    x1          = _fjsp_unpacklo_v2r8(x1,y1);
+    z1          = _fjsp_unpacklo_v2r8(z1,x2);
+    y2          = _fjsp_unpacklo_v2r8(y2,z2);
+    x3          = _fjsp_unpacklo_v2r8(x3,y3);
+    /* nothing to be done for z3 */
+    
+    t1          = _fjsp_sub_v2r8(t1,x1);
+    t2          = _fjsp_sub_v2r8(t2,z1);
+    t3          = _fjsp_sub_v2r8(t3,y2);
+    t4          = _fjsp_sub_v2r8(t4,x3);
+    t5          = _fjsp_sub_v2r8(t5,z3);
+    /* Each pair is written back as two separate single-double stores */
+    _fjsp_storel_v2r8(ptrA,t1);
+    _fjsp_storeh_v2r8(ptrA+1,t1);
+    _fjsp_storel_v2r8(ptrA+2,t2);
+    _fjsp_storeh_v2r8(ptrA+3,t2);
+    _fjsp_storel_v2r8(ptrA+4,t3);
+    _fjsp_storeh_v2r8(ptrA+5,t3);
+    _fjsp_storel_v2r8(ptrA+6,t4);
+    _fjsp_storeh_v2r8(ptrA+7,t4);
+    _fjsp_storel_v2r8(ptrA+8,t5);
+}
+
+
+/* Subtract four rvecs from the twelve doubles at ptrA[0..11]. The twelve
+ * inputs are first packed pairwise with _fjsp_unpacklo_v2r8 so they match the
+ * two-doubles-at-a-time layout used for the loads. Each difference is computed
+ * once and then written back with one storel and one storeh, in the same way
+ * as the 3rvec version. The by-value parameters x1, z1, y2, x3, z3 and y4 are
+ * reused as scratch for the packed pairs.
+ */
+static void
+gmx_fjsp_decrement_4rvec_1ptr_swizzle_v2r8(double * gmx_restrict ptrA,
+                                       _fjsp_v2r8 x1, _fjsp_v2r8 y1, _fjsp_v2r8 z1,
+                                       _fjsp_v2r8 x2, _fjsp_v2r8 y2, _fjsp_v2r8 z2,
+                                       _fjsp_v2r8 x3, _fjsp_v2r8 y3, _fjsp_v2r8 z3,
+                                       _fjsp_v2r8 x4, _fjsp_v2r8 y4, _fjsp_v2r8 z4)
+{
+    _fjsp_v2r8 t1,t2,t3,t4,t5,t6;
+    
+    t1          = _fjsp_load_v2r8(ptrA);
+    t2          = _fjsp_load_v2r8(ptrA+2);
+    t3          = _fjsp_load_v2r8(ptrA+4);
+    t4          = _fjsp_load_v2r8(ptrA+6);
+    t5          = _fjsp_load_v2r8(ptrA+8);
+    t6          = _fjsp_load_v2r8(ptrA+10);
+    
+    /* Pack as (x1,y1) (z1,x2) (y2,z2) (x3,y3) (z3,x4) (y4,z4) */
+    x1          = _fjsp_unpacklo_v2r8(x1,y1);
+    z1          = _fjsp_unpacklo_v2r8(z1,x2);
+    y2          = _fjsp_unpacklo_v2r8(y2,z2);
+    x3          = _fjsp_unpacklo_v2r8(x3,y3);
+    z3          = _fjsp_unpacklo_v2r8(z3,x4);
+    y4          = _fjsp_unpacklo_v2r8(y4,z4);
+    
+    /* Compute each difference once instead of once per store */
+    t1          = _fjsp_sub_v2r8(t1,x1);
+    t2          = _fjsp_sub_v2r8(t2,z1);
+    t3          = _fjsp_sub_v2r8(t3,y2);
+    t4          = _fjsp_sub_v2r8(t4,x3);
+    t5          = _fjsp_sub_v2r8(t5,z3);
+    t6          = _fjsp_sub_v2r8(t6,y4);
+    
+    _fjsp_storel_v2r8(ptrA,t1);
+    _fjsp_storeh_v2r8(ptrA+1,t1);
+    _fjsp_storel_v2r8(ptrA+2,t2);
+    _fjsp_storeh_v2r8(ptrA+3,t2);
+    _fjsp_storel_v2r8(ptrA+4,t3);
+    _fjsp_storeh_v2r8(ptrA+5,t3);
+    _fjsp_storel_v2r8(ptrA+6,t4);
+    _fjsp_storeh_v2r8(ptrA+7,t4);
+    _fjsp_storel_v2r8(ptrA+8,t5);
+    _fjsp_storeh_v2r8(ptrA+9,t5);
+    _fjsp_storel_v2r8(ptrA+10,t6);
+    _fjsp_storeh_v2r8(ptrA+11,t6);
+}
+
+/* Subtract one rvec from each of two destinations. The inputs x1, y1, z1 each
+ * carry one value for ptrA and one for ptrB; the unpacklo results are applied
+ * to ptrA[0..2] and the unpackhi results to ptrB[0..2].
+ */
+static void
+gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
+                                          _fjsp_v2r8 x1, _fjsp_v2r8 y1, _fjsp_v2r8 z1)
+{
+  _fjsp_v2r8 t1,t2,t3,t4,t5,t6,t7;
+    
+  t1          = _fjsp_load_v2r8(ptrA);
+  t2          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),ptrA+2);
+  t3          = _fjsp_load_v2r8(ptrB);
+  t4          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),ptrB+2);
+    
+  /* t5 = (x,y) for A, t6 = (x,y) for B, t7 = z for B; z1 itself serves A */
+  t5          = _fjsp_unpacklo_v2r8(x1,y1);
+  t6          = _fjsp_unpackhi_v2r8(x1,y1);
+  t7          = _fjsp_unpackhi_v2r8(z1,z1);
+    
+  t1          = _fjsp_sub_v2r8(t1,t5);
+  t2          = _fjsp_sub_v2r8(t2,z1);
+    
+  t3          = _fjsp_sub_v2r8(t3,t6);
+  t4          = _fjsp_sub_v2r8(t4,t7);
+    
+  _fjsp_storel_v2r8(ptrA,t1);
+  _fjsp_storeh_v2r8(ptrA+1,t1);
+  _fjsp_storel_v2r8(ptrA+2,t2);
+  _fjsp_storel_v2r8(ptrB,t3);
+  _fjsp_storeh_v2r8(ptrB+1,t3);
+  _fjsp_storel_v2r8(ptrB+2,t4);
+}
+
+
+/* Fused variant of the 1rvec/2ptr decrement: the three doubles at ptrA and at
+ * ptrB are each updated with _fjsp_nmsub_v2r8(scale,d,mem), presumably
+ * mem - scale*d (TODO confirm against the intrinsic reference). fscal is
+ * split into fscalA (unpacklo) for ptrA and fscalB (unpackhi) for ptrB.
+ */
+static void
+gmx_fjsp_decrement_fma_1rvec_2ptr_swizzle_v2r8(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
+                                              _fjsp_v2r8 fscal, _fjsp_v2r8 dx1, _fjsp_v2r8 dy1, _fjsp_v2r8 dz1)
+{
+  _fjsp_v2r8 t1,t2,t3,t4,t5,t6,t7,fscalA,fscalB;
+    
+    t1          = _fjsp_load_v2r8(ptrA);
+    t2          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),ptrA+2);
+    t3          = _fjsp_load_v2r8(ptrB);
+    t4          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),ptrB+2);
+    /* Broadcast the per-destination scale factor */
+    fscalA      = _fjsp_unpacklo_v2r8(fscal,fscal);
+    fscalB      = _fjsp_unpackhi_v2r8(fscal,fscal);
+    
+    /* t5 = (dx,dy) for A, t6 = (dx,dy) for B, t7 = dz for B; dz1 itself serves A */
+    t5          = _fjsp_unpacklo_v2r8(dx1,dy1);
+    t6          = _fjsp_unpackhi_v2r8(dx1,dy1);
+    t7          = _fjsp_unpackhi_v2r8(dz1,dz1);
+    
+    t1          = _fjsp_nmsub_v2r8(fscalA,t5,t1);
+    t2          = _fjsp_nmsub_v2r8(fscalA,dz1,t2);
+    
+    t3          = _fjsp_nmsub_v2r8(fscalB,t6,t3);
+    t4          = _fjsp_nmsub_v2r8(fscalB,t7,t4);
+    
+    _fjsp_storel_v2r8(ptrA,t1);
+    _fjsp_storeh_v2r8(ptrA+1,t1);
+    _fjsp_storel_v2r8(ptrA+2,t2);
+    _fjsp_storel_v2r8(ptrB,t3);
+    _fjsp_storeh_v2r8(ptrB+1,t3);
+    _fjsp_storel_v2r8(ptrB+2,t4);
+}
+
+
+/* Subtract three rvecs from each of two destinations (nine doubles at ptrA and
+ * nine at ptrB). Each input register carries one value for ptrA and one for
+ * ptrB: the unpacklo combinations (tA,tC,tE,tG and z3 itself) are applied to
+ * ptrA, the unpackhi combinations (tB,tD,tF,tH,tI) to ptrB.
+ * All loads from both pointers happen before any store.
+ */
+static void
+gmx_fjsp_decrement_3rvec_2ptr_swizzle_v2r8(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
+                                       _fjsp_v2r8 x1, _fjsp_v2r8 y1, _fjsp_v2r8 z1,
+                                       _fjsp_v2r8 x2, _fjsp_v2r8 y2, _fjsp_v2r8 z2,
+                                       _fjsp_v2r8 x3, _fjsp_v2r8 y3, _fjsp_v2r8 z3) 
+{
+    _fjsp_v2r8 t1,t2,t3,t4,t5,t6,t7,t8,t9,t10;
+    _fjsp_v2r8 tA,tB,tC,tD,tE,tF,tG,tH,tI;
+    
+    /* t1..t5: current contents of ptrA[0..8]; t6..t10: ptrB[0..8] */
+    t1          = _fjsp_load_v2r8(ptrA);
+    t2          = _fjsp_load_v2r8(ptrA+2);
+    t3          = _fjsp_load_v2r8(ptrA+4);
+    t4          = _fjsp_load_v2r8(ptrA+6);
+    t5          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),ptrA+8);
+    t6          = _fjsp_load_v2r8(ptrB);
+    t7          = _fjsp_load_v2r8(ptrB+2);
+    t8          = _fjsp_load_v2r8(ptrB+4);
+    t9          = _fjsp_load_v2r8(ptrB+6);
+    t10         = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),ptrB+8);
+    
+    /* Pack the inputs pairwise to match the memory layout of the loads */
+    tA          = _fjsp_unpacklo_v2r8(x1,y1);
+    tB          = _fjsp_unpackhi_v2r8(x1,y1);
+    tC          = _fjsp_unpacklo_v2r8(z1,x2);
+    tD          = _fjsp_unpackhi_v2r8(z1,x2);
+    tE          = _fjsp_unpacklo_v2r8(y2,z2);
+    tF          = _fjsp_unpackhi_v2r8(y2,z2);
+    tG          = _fjsp_unpacklo_v2r8(x3,y3);
+    tH          = _fjsp_unpackhi_v2r8(x3,y3);
+    tI          = _fjsp_unpackhi_v2r8(z3,z3);
+    
+    t1          = _fjsp_sub_v2r8(t1,tA);
+    t2          = _fjsp_sub_v2r8(t2,tC);
+    t3          = _fjsp_sub_v2r8(t3,tE);
+    t4          = _fjsp_sub_v2r8(t4,tG);
+    t5          = _fjsp_sub_v2r8(t5,z3);
+    
+    t6          = _fjsp_sub_v2r8(t6,tB);
+    t7          = _fjsp_sub_v2r8(t7,tD);
+    t8          = _fjsp_sub_v2r8(t8,tF);
+    t9          = _fjsp_sub_v2r8(t9,tH);
+    t10         = _fjsp_sub_v2r8(t10,tI);
+    
+    /* Each pair is written back as two separate single-double stores */
+    _fjsp_storel_v2r8(ptrA,t1);
+    _fjsp_storeh_v2r8(ptrA+1,t1);
+    _fjsp_storel_v2r8(ptrA+2,t2);
+    _fjsp_storeh_v2r8(ptrA+3,t2);
+    _fjsp_storel_v2r8(ptrA+4,t3);
+    _fjsp_storeh_v2r8(ptrA+5,t3);
+    _fjsp_storel_v2r8(ptrA+6,t4);
+    _fjsp_storeh_v2r8(ptrA+7,t4);
+    _fjsp_storel_v2r8(ptrA+8,t5);
+    _fjsp_storel_v2r8(ptrB,t6);
+    _fjsp_storeh_v2r8(ptrB+1,t6);
+    _fjsp_storel_v2r8(ptrB+2,t7);
+    _fjsp_storeh_v2r8(ptrB+3,t7);
+    _fjsp_storel_v2r8(ptrB+4,t8);
+    _fjsp_storeh_v2r8(ptrB+5,t8);
+    _fjsp_storel_v2r8(ptrB+6,t9);
+    _fjsp_storeh_v2r8(ptrB+7,t9);
+    _fjsp_storel_v2r8(ptrB+8,t10);
+}
+
+
+/* Subtract four (x,y,z) triplets from the twelve doubles at ptrA and at ptrB.
+ *
+ * The inputs are combined pairwise in the order x1,y1,z1,...,x4,y4,z4: the
+ * unpacklo combinations are subtracted from ptrA[0..11] and the unpackhi
+ * combinations from ptrB[0..11]. Every load is issued before the first store.
+ */
+static void
+gmx_fjsp_decrement_4rvec_2ptr_swizzle_v2r8(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
+                                       _fjsp_v2r8 x1, _fjsp_v2r8 y1, _fjsp_v2r8 z1,
+                                       _fjsp_v2r8 x2, _fjsp_v2r8 y2, _fjsp_v2r8 z2,
+                                       _fjsp_v2r8 x3, _fjsp_v2r8 y3, _fjsp_v2r8 z3,
+                                       _fjsp_v2r8 x4, _fjsp_v2r8 y4, _fjsp_v2r8 z4)
+{
+    /* fAn / fBn hold the data that lives at ptrA+n / ptrB+n */
+    _fjsp_v2r8 fA0,fA2,fA4,fA6,fA8,fA10;
+    _fjsp_v2r8 fB0,fB2,fB4,fB6,fB8,fB10;
+
+    fA0         = _fjsp_load_v2r8(ptrA);
+    fA2         = _fjsp_load_v2r8(ptrA+2);
+    fA4         = _fjsp_load_v2r8(ptrA+4);
+    fA6         = _fjsp_load_v2r8(ptrA+6);
+    fA8         = _fjsp_load_v2r8(ptrA+8);
+    fA10        = _fjsp_load_v2r8(ptrA+10);
+    fB0         = _fjsp_load_v2r8(ptrB);
+    fB2         = _fjsp_load_v2r8(ptrB+2);
+    fB4         = _fjsp_load_v2r8(ptrB+4);
+    fB6         = _fjsp_load_v2r8(ptrB+6);
+    fB8         = _fjsp_load_v2r8(ptrB+8);
+    fB10        = _fjsp_load_v2r8(ptrB+10);
+
+    /* Destination A receives the unpacklo combinations */
+    fA0         = _fjsp_sub_v2r8(fA0,_fjsp_unpacklo_v2r8(x1,y1));
+    fA2         = _fjsp_sub_v2r8(fA2,_fjsp_unpacklo_v2r8(z1,x2));
+    fA4         = _fjsp_sub_v2r8(fA4,_fjsp_unpacklo_v2r8(y2,z2));
+    fA6         = _fjsp_sub_v2r8(fA6,_fjsp_unpacklo_v2r8(x3,y3));
+    fA8         = _fjsp_sub_v2r8(fA8,_fjsp_unpacklo_v2r8(z3,x4));
+    fA10        = _fjsp_sub_v2r8(fA10,_fjsp_unpacklo_v2r8(y4,z4));
+
+    /* Destination B receives the unpackhi combinations */
+    fB0         = _fjsp_sub_v2r8(fB0,_fjsp_unpackhi_v2r8(x1,y1));
+    fB2         = _fjsp_sub_v2r8(fB2,_fjsp_unpackhi_v2r8(z1,x2));
+    fB4         = _fjsp_sub_v2r8(fB4,_fjsp_unpackhi_v2r8(y2,z2));
+    fB6         = _fjsp_sub_v2r8(fB6,_fjsp_unpackhi_v2r8(x3,y3));
+    fB8         = _fjsp_sub_v2r8(fB8,_fjsp_unpackhi_v2r8(z3,x4));
+    fB10        = _fjsp_sub_v2r8(fB10,_fjsp_unpackhi_v2r8(y4,z4));
+
+    /* Write back element by element */
+    _fjsp_storel_v2r8(ptrA,  fA0);
+    _fjsp_storeh_v2r8(ptrA+1,fA0);
+    _fjsp_storel_v2r8(ptrA+2,fA2);
+    _fjsp_storeh_v2r8(ptrA+3,fA2);
+    _fjsp_storel_v2r8(ptrA+4,fA4);
+    _fjsp_storeh_v2r8(ptrA+5,fA4);
+    _fjsp_storel_v2r8(ptrA+6,fA6);
+    _fjsp_storeh_v2r8(ptrA+7,fA6);
+    _fjsp_storel_v2r8(ptrA+8,fA8);
+    _fjsp_storeh_v2r8(ptrA+9,fA8);
+    _fjsp_storel_v2r8(ptrA+10,fA10);
+    _fjsp_storeh_v2r8(ptrA+11,fA10);
+    _fjsp_storel_v2r8(ptrB,  fB0);
+    _fjsp_storeh_v2r8(ptrB+1,fB0);
+    _fjsp_storel_v2r8(ptrB+2,fB2);
+    _fjsp_storeh_v2r8(ptrB+3,fB2);
+    _fjsp_storel_v2r8(ptrB+4,fB4);
+    _fjsp_storeh_v2r8(ptrB+5,fB4);
+    _fjsp_storel_v2r8(ptrB+6,fB6);
+    _fjsp_storeh_v2r8(ptrB+7,fB6);
+    _fjsp_storel_v2r8(ptrB+8,fB8);
+    _fjsp_storeh_v2r8(ptrB+9,fB8);
+    _fjsp_storel_v2r8(ptrB+10,fB10);
+    _fjsp_storeh_v2r8(ptrB+11,fB10);
+}
+
+
+
+/* Reduce the force accumulators of one i atom over both SIMD elements and add
+ * the result to the three doubles at fptr and to the three doubles at fshiftptr.
+ *
+ * fix1/fiy1/fiz1  accumulated x, y and z force, one partial sum per element
+ * fptr            force of the i atom (x,y,z), updated in place
+ * fshiftptr       shift force (x,y,z), updated in place
+ */
+static gmx_inline void
+gmx_fjsp_update_iforce_1atom_swizzle_v2r8(_fjsp_v2r8 fix1, _fjsp_v2r8 fiy1, _fjsp_v2r8 fiz1,
+                                      double * gmx_restrict fptr,
+                                      double * gmx_restrict fshiftptr)
+{
+    /* Declared with the HPC-ACE vector type of the arguments rather than the SSE2 name __m128d */
+    _fjsp_v2r8 t1,t4;
+    
+    /* transpose data */
+    t1 = fix1;
+    fix1 = _fjsp_unpacklo_v2r8(fix1,fiy1); /* y0 x0 */
+    fiy1 = _fjsp_unpackhi_v2r8(t1,fiy1);   /* y1 x1 */
+    
+    /* Sum the two elements: fix1 then holds the x and y totals, fiz1 the z total in its low element */
+    fix1 = _fjsp_add_v2r8(fix1,fiy1);
+    fiz1 = _fjsp_add_v2r8( fiz1, _fjsp_unpackhi_v2r8(fiz1,fiz1 ));
+    
+    /* Increment the force of the i atom */
+    t4 = _fjsp_add_v2r8( _fjsp_load_v2r8(fptr), fix1 );
+    _fjsp_storel_v2r8( fptr, t4 );
+    _fjsp_storeh_v2r8( fptr+1, t4 );
+    _fjsp_storel_v2r8( fptr+2, _fjsp_add_v2r8( _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),fptr+2), fiz1 ));
+    
+    /* Increment the shift force with the same totals */
+    t4 = _fjsp_add_v2r8( _fjsp_load_v2r8(fshiftptr), fix1 );
+    _fjsp_storel_v2r8( fshiftptr, t4 );
+    _fjsp_storeh_v2r8( fshiftptr+1, t4 );
+    _fjsp_storel_v2r8( fshiftptr+2, _fjsp_add_v2r8( _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),fshiftptr+2), fiz1 ));
+}
+
+/* Reduce the force accumulators of three i atoms over both SIMD elements, add
+ * them to the nine doubles at fptr, and add the per-component total over the
+ * three atoms to the three doubles at fshiftptr.
+ *
+ * fixN/fiyN/fizN  accumulated x, y and z force of atom N, one partial sum per element
+ * fptr            forces of the three i atoms (x1,y1,z1,...,z3), updated in place
+ * fshiftptr       shift force (x,y,z), updated in place
+ */
+static gmx_inline void
+gmx_fjsp_update_iforce_3atom_swizzle_v2r8(_fjsp_v2r8 fix1, _fjsp_v2r8 fiy1, _fjsp_v2r8 fiz1,
+                                      _fjsp_v2r8 fix2, _fjsp_v2r8 fiy2, _fjsp_v2r8 fiz2,
+                                      _fjsp_v2r8 fix3, _fjsp_v2r8 fiy3, _fjsp_v2r8 fiz3,
+                                      double * gmx_restrict fptr,
+                                      double * gmx_restrict fshiftptr)
+{
+    /* Declared with the HPC-ACE vector type of the arguments rather than the SSE2 name __m128d */
+    _fjsp_v2r8 t1,t2,t3,t4,t5,t6;
+    
+    /* transpose data */
+    GMX_FJSP_TRANSPOSE2_V2R8(fix1,fiy1);
+    GMX_FJSP_TRANSPOSE2_V2R8(fiz1,fix2);
+    GMX_FJSP_TRANSPOSE2_V2R8(fiy2,fiz2);
+    t1 = fix3;
+    fix3 = _fjsp_unpacklo_v2r8(fix3,fiy3); /* y0 x0 */
+    fiy3 = _fjsp_unpackhi_v2r8(t1,fiy3);   /* y1 x1 */
+    
+    /* Sum the transposed pairs; the first operand of each add now holds the
+     * totals in memory order: (x1,y1), (z1,x2), (y2,z2)
+     */
+    fix1 = _fjsp_add_v2r8(fix1,fiy1);
+    fiz1 = _fjsp_add_v2r8(fiz1,fix2);
+    fiy2 = _fjsp_add_v2r8(fiy2,fiz2);
+    
+    /* (x3,y3) and, in the low element of fiz3, z3 */
+    fix3 = _fjsp_add_v2r8(fix3,fiy3);
+    fiz3 = _fjsp_add_v2r8( fiz3, _fjsp_unpackhi_v2r8(fiz3,fiz3));
+    
+    /* Increment the forces of the i atoms */
+    t3 = _fjsp_add_v2r8( _fjsp_load_v2r8(fptr), fix1 );
+    t4 = _fjsp_add_v2r8( _fjsp_load_v2r8(fptr+2), fiz1 );
+    t5 = _fjsp_add_v2r8( _fjsp_load_v2r8(fptr+4), fiy2 );
+    t6 = _fjsp_add_v2r8( _fjsp_load_v2r8(fptr+6), fix3 );
+
+    _fjsp_storel_v2r8( fptr,   t3 );
+    _fjsp_storeh_v2r8( fptr+1, t3 );
+    _fjsp_storel_v2r8( fptr+2, t4 );
+    _fjsp_storeh_v2r8( fptr+3, t4 );
+    _fjsp_storel_v2r8( fptr+4, t5 );
+    _fjsp_storeh_v2r8( fptr+5, t5 );
+    _fjsp_storel_v2r8( fptr+6, t6 );
+    _fjsp_storeh_v2r8( fptr+7, t6 );
+    _fjsp_storel_v2r8( fptr+8, _fjsp_add_v2r8( _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),fptr+8), fiz3 ));
+    
+    /* Total over the three atoms for the shift force */
+    fix1 = _fjsp_add_v2r8(fix1,fix3);
+    t1   = _fjsp_shuffle_v2r8(fiz1,fiy2,GMX_FJSP_SHUFFLE2(0,1));
+    fix1 = _fjsp_add_v2r8(fix1,t1); /* x and y sums */
+    
+    t2   = _fjsp_shuffle_v2r8(fiy2,fiy2,GMX_FJSP_SHUFFLE2(1,1));
+    fiz1 = _fjsp_add_v2r8(fiz1,fiz3);
+    fiz1 = _fjsp_add_v2r8(fiz1,t2); /* z sum */
+    
+    t3 = _fjsp_add_v2r8( _fjsp_load_v2r8(fshiftptr), fix1 );
+    _fjsp_storel_v2r8( fshiftptr, t3 );
+    _fjsp_storeh_v2r8( fshiftptr+1, t3 );
+    _fjsp_storel_v2r8( fshiftptr+2, _fjsp_add_v2r8( _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),fshiftptr+2), fiz1 ));
+}
+
+
+/* Reduce the force accumulators of four i atoms over both SIMD elements, add
+ * them to the twelve doubles at fptr, and add the per-component total over the
+ * four atoms to the three doubles at fshiftptr.
+ *
+ * fixN/fiyN/fizN  accumulated x, y and z force of atom N, one partial sum per element
+ * fptr            forces of the four i atoms (x1,y1,z1,...,z4), updated in place
+ * fshiftptr       shift force (x,y,z), updated in place
+ */
+static gmx_inline void
+gmx_fjsp_update_iforce_4atom_swizzle_v2r8(_fjsp_v2r8 fix1, _fjsp_v2r8 fiy1, _fjsp_v2r8 fiz1,
+                                      _fjsp_v2r8 fix2, _fjsp_v2r8 fiy2, _fjsp_v2r8 fiz2,
+                                      _fjsp_v2r8 fix3, _fjsp_v2r8 fiy3, _fjsp_v2r8 fiz3,
+                                      _fjsp_v2r8 fix4, _fjsp_v2r8 fiy4, _fjsp_v2r8 fiz4,
+                                      double * gmx_restrict fptr,
+                                      double * gmx_restrict fshiftptr)
+{
+    /* Declared with the HPC-ACE vector type of the arguments rather than the SSE2 name __m128d */
+    _fjsp_v2r8 t1,t2,t3,t4,t5,t6,t7,t8;
+    
+    /* transpose data */
+    GMX_FJSP_TRANSPOSE2_V2R8(fix1,fiy1);
+    GMX_FJSP_TRANSPOSE2_V2R8(fiz1,fix2);
+    GMX_FJSP_TRANSPOSE2_V2R8(fiy2,fiz2);
+    GMX_FJSP_TRANSPOSE2_V2R8(fix3,fiy3);
+    GMX_FJSP_TRANSPOSE2_V2R8(fiz3,fix4);
+    GMX_FJSP_TRANSPOSE2_V2R8(fiy4,fiz4);
+    
+    /* Sum the transposed pairs; the first operand of each add now holds the
+     * totals in memory order: (x1,y1), (z1,x2), (y2,z2), (x3,y3), (z3,x4), (y4,z4)
+     */
+    fix1 = _fjsp_add_v2r8(fix1,fiy1);
+    fiz1 = _fjsp_add_v2r8(fiz1,fix2);
+    fiy2 = _fjsp_add_v2r8(fiy2,fiz2);
+    fix3 = _fjsp_add_v2r8(fix3,fiy3);
+    fiz3 = _fjsp_add_v2r8(fiz3,fix4);
+    fiy4 = _fjsp_add_v2r8(fiy4,fiz4);
+    
+    /* Increment the forces of the i atoms */
+    t3 = _fjsp_add_v2r8( _fjsp_load_v2r8(fptr),    fix1 );
+    t4 = _fjsp_add_v2r8( _fjsp_load_v2r8(fptr+2),  fiz1 );
+    t5 = _fjsp_add_v2r8( _fjsp_load_v2r8(fptr+4),  fiy2 );
+    t6 = _fjsp_add_v2r8( _fjsp_load_v2r8(fptr+6),  fix3 );
+    t7 = _fjsp_add_v2r8( _fjsp_load_v2r8(fptr+8),  fiz3 );
+    t8 = _fjsp_add_v2r8( _fjsp_load_v2r8(fptr+10), fiy4 );
+    _fjsp_storel_v2r8( fptr,    t3 );
+    _fjsp_storeh_v2r8( fptr+1,  t3 );
+    _fjsp_storel_v2r8( fptr+2,  t4 );
+    _fjsp_storeh_v2r8( fptr+3,  t4 );
+    _fjsp_storel_v2r8( fptr+4,  t5 );
+    _fjsp_storeh_v2r8( fptr+5,  t5 );
+    _fjsp_storel_v2r8( fptr+6,  t6 );
+    _fjsp_storeh_v2r8( fptr+7,  t6 );
+    _fjsp_storel_v2r8( fptr+8,  t7 );
+    _fjsp_storeh_v2r8( fptr+9,  t7 );
+    _fjsp_storel_v2r8( fptr+10, t8 );
+    _fjsp_storeh_v2r8( fptr+11, t8 );
+
+    /* Total over the four atoms for the shift force */
+    t1 = _fjsp_shuffle_v2r8(fiz1,fiy2,GMX_FJSP_SHUFFLE2(0,1));
+    fix1 = _fjsp_add_v2r8(fix1,t1);
+    t2 = _fjsp_shuffle_v2r8(fiz3,fiy4,GMX_FJSP_SHUFFLE2(0,1));
+    fix3 = _fjsp_add_v2r8(fix3,t2);
+    fix1 = _fjsp_add_v2r8(fix1,fix3); /* x and y sums */
+    
+    fiz1 = _fjsp_add_v2r8(fiz1, _fjsp_unpackhi_v2r8(fiy2,fiy2));
+    fiz3 = _fjsp_add_v2r8(fiz3, _fjsp_unpackhi_v2r8(fiy4,fiy4));
+    fiz1 = _fjsp_add_v2r8(fiz1,fiz3); /* z sum */
+    
+    t3 = _fjsp_add_v2r8( _fjsp_load_v2r8(fshiftptr), fix1 );
+    _fjsp_storel_v2r8( fshiftptr, t3 );
+    _fjsp_storeh_v2r8( fshiftptr+1, t3 );
+    _fjsp_storel_v2r8( fshiftptr+2, _fjsp_add_v2r8( _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),fshiftptr+2), fiz1 ));
+}
+
+
+
+/* Add both elements of pot1 together and increment the double at ptrA by that sum. */
+static gmx_inline void
+gmx_fjsp_update_1pot_v2r8(_fjsp_v2r8 pot1, double * gmx_restrict ptrA)
+{
+    _fjsp_v2r8 upper,stored;
+
+    /* Bring the second element down so a single add yields the total in the low element */
+    upper  = _fjsp_unpackhi_v2r8(pot1,pot1);
+    pot1   = _fjsp_add_v2r8(pot1,upper);
+
+    stored = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),ptrA);
+    _fjsp_storel_v2r8(ptrA,_fjsp_add_v2r8(pot1,stored));
+}
+
+/* Reduce pot1 and pot2 over their two elements and increment the doubles at
+ * ptrA and ptrB with the respective totals. The transpose lets a single add
+ * produce both totals at once.
+ */
+static gmx_inline void
+gmx_fjsp_update_2pot_v2r8(_fjsp_v2r8 pot1, double * gmx_restrict ptrA,
+                      _fjsp_v2r8 pot2, double * gmx_restrict ptrB)
+{
+    _fjsp_v2r8 totals,totalB,stored;
+
+    GMX_FJSP_TRANSPOSE2_V2R8(pot1,pot2);
+    totals = _fjsp_add_v2r8(pot1,pot2);
+    /* The value destined for ptrB sits in the upper element; move it down for storel */
+    totalB = _fjsp_unpackhi_v2r8(totals,totals);
+
+    stored = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),ptrA);
+    _fjsp_storel_v2r8(ptrA,_fjsp_add_v2r8(totals,stored));
+
+    stored = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),ptrB);
+    _fjsp_storel_v2r8(ptrB,_fjsp_add_v2r8(totalB,stored));
+}
+
+
+#endif /* _kernelutil_sparc64_hpc_ace_double_h_ */
diff --git a/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/make_nb_kernel_sparc64_hpc_ace_double.py b/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/make_nb_kernel_sparc64_hpc_ace_double.py
new file mode 100755 (executable)
index 0000000..9b723bd
--- /dev/null
@@ -0,0 +1,538 @@
+#!/usr/bin/python
+#
+# This file is part of the GROMACS molecular simulation package.
+#
+# Copyright (c) 2012, by the GROMACS development team, led by
+# David van der Spoel, Berk Hess, Erik Lindahl, and including many
+# others, as listed in the AUTHORS file in the top-level source
+# directory and at http://www.gromacs.org.
+#
+# GROMACS is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public License
+# as published by the Free Software Foundation; either version 2.1
+# of the License, or (at your option) any later version.
+#
+# GROMACS is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with GROMACS; if not, see
+# http://www.gnu.org/licenses, or write to the Free Software Foundation,
+# Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+#
+# If you want to redistribute modifications to GROMACS, please
+# consider that scientific software is very special. Version
+# control is crucial - bugs must be traceable. We will be happy to
+# consider code for inclusion in the official distribution, but
+# derived work must not be called official GROMACS. Details are found
+# in the README & COPYING files - if they are missing, get the
+# official version at http://www.gromacs.org.
+#
+# To help us fund GROMACS development, we humbly ask that you cite
+# the research papers on the package. Check out http://www.gromacs.org
+
+import sys
+import os
+sys.path.append ( "../preprocessor" )
+from gmxpreprocess import gmxpreprocess
+
+# "The happiest programs are programs that write other programs."
+#
+#
+# This script controls the generation of Gromacs nonbonded kernels.
+#
+# We no longer generate kernels on-the-fly, so this file is not run
+# during a Gromacs compile - only when we need to update the kernels (=rarely).
+#
+# To maximize performance, each combination of interactions in Gromacs
+# has a separate nonbonded kernel without conditionals in the code.
+# To avoid writing hundreds of different routines for each architecture,
+# we instead use a custom preprocessor so we can encode the conditionals
+# and expand for-loops (e.g, for water-water interactions)
+# from a general kernel template. While that file will contain quite a
+# few preprocessor directives, it is still an order of magnitude easier
+# to maintain than ~200 different kernels (not to mention it avoids bugs).
+#
+# To actually generate the kernels, this program iteratively calls the
+# preprocessor with different define settings corresponding to all
+# combinations of coulomb/van-der-Waals/geometry options.
+#
+# A main goal in the design was to make this new generator _general_. For
+# this reason we have used a lot of different fields to identify a particular
+# kernel and interaction. Basically, each kernel will have a name like
+#
+# nb_kernel_ElecXX_VdwYY_GeomZZ_VF_QQ()
+#
+# Where XX/YY/ZZ/VF are strings to identify what the kernel computes.
+#
+# Elec/Vdw describe the type of interaction for electrostatics and van der Waals.
+# The geometry settings correspond e.g. to water-water or water-particle kernels,
+# and finally the VF setting is V,F,or VF depending on whether we calculate
+# only the potential, only the force, or both of them. The final string (QQ)
+# is the architecture/language/optimization of the kernel.
+#
+# Architecture tag: appended to every kernel and file name, and part of the template file name
+Arch       = 'sparc64_hpc_ace_double'
+
+# Explanation of the 'properties':
+#
+# It is cheap to compute r^2, and the kernels require various other functions of r for
+# different kinds of interaction. Depending on the needs of the kernel and the available
+# processor instructions, this will be done in different ways.
+#
+# 'rinv' means we need 1/r, which is calculated as 1/sqrt(r^2).
+# 'rinvsq' means we need 1/(r*r). This is calculated as rinv*rinv if we already did rinv, otherwise 1/r^2.
+# 'r' is similarly calculated as r^2*rinv when needed
+# 'table' means the interaction is tabulated, in which case we will calculate a table index before the interaction
+# 'shift' means the interaction will be modified by a constant to make it zero at the cutoff.
+# 'cutoff' means the interaction is set to 0.0 outside the cutoff
+#
+
+# Text written at the top of every generated C file: the license banner
+# followed by a note naming this generator (built from Arch).
+FileHeader = \
+'/*\n' \
+' * This file is part of the GROMACS molecular simulation package.\n' \
+' *\n' \
+' * Copyright (c) 2012, by the GROMACS development team, led by\n' \
+' * David van der Spoel, Berk Hess, Erik Lindahl, and including many\n' \
+' * others, as listed in the AUTHORS file in the top-level source\n' \
+' * directory and at http://www.gromacs.org.\n' \
+' *\n' \
+' * GROMACS is free software; you can redistribute it and/or\n' \
+' * modify it under the terms of the GNU Lesser General Public License\n' \
+' * as published by the Free Software Foundation; either version 2.1\n' \
+' * of the License, or (at your option) any later version.\n' \
+' *\n' \
+' * GROMACS is distributed in the hope that it will be useful,\n' \
+' * but WITHOUT ANY WARRANTY; without even the implied warranty of\n' \
+' * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU\n' \
+' * Lesser General Public License for more details.\n' \
+' *\n' \
+' * You should have received a copy of the GNU Lesser General Public\n' \
+' * License along with GROMACS; if not, see\n' \
+' * http://www.gnu.org/licenses, or write to the Free Software Foundation,\n' \
+' * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.\n' \
+' *\n' \
+' * If you want to redistribute modifications to GROMACS, please\n' \
+' * consider that scientific software is very special. Version\n' \
+' * control is crucial - bugs must be traceable. We will be happy to\n' \
+' * consider code for inclusion in the official distribution, but\n' \
+' * derived work must not be called official GROMACS. Details are found\n' \
+' * in the README & COPYING files - if they are missing, get the\n' \
+' * official version at http://www.gromacs.org.\n' \
+' *\n' \
+' * To help us fund GROMACS development, we humbly ask that you cite\n' \
+' * the research papers on the package. Check out http://www.gromacs.org.\n' \
+' */\n' \
+'/*\n' \
+' * Note: this file was generated by the GROMACS '+Arch+' kernel generator.\n' \
+' */\n'
+
+###############################################
+# ELECTROSTATICS
+# Interactions and flags for them
+###############################################
+# Maps each electrostatics interaction name to the list of distance-property
+# flags it needs (see the explanation of the 'properties' above).
+ElectrostaticsList = {
+    'None'                    : [],
+    'Coulomb'                 : ['rinv','rinvsq'],
+    'ReactionField'           : ['rinv','rinvsq'],
+    'GeneralizedBorn'         : ['rinv','r'],
+    'CubicSplineTable'        : ['rinv','r','table'],
+    'Ewald'                   : ['rinv','rinvsq','r'],
+}
+
+
+###############################################
+# VAN DER WAALS
+# Interactions and flags for them
+###############################################
+# Maps each van der Waals interaction name to the list of distance-property flags it needs.
+VdwList = {
+    'None'                    : [],
+    'LennardJones'            : ['rinvsq'],
+#    'Buckingham'              : ['rinv','rinvsq','r'], # Disabled for sse4.1 to reduce number of kernels and simplify the template
+    'CubicSplineTable'        : ['rinv','r','table'],
+}
+
+
+###############################################
+# MODIFIERS
+# Different ways to adjust/modify interactions to conserve energy
+###############################################
+# Maps each modifier name to the extra property flags it adds to an interaction.
+# The same table is used for electrostatics and Vdw modifiers.
+ModifierList = {
+    'None'                    : [],
+    'ExactCutoff'             : ['exactcutoff'],        # Zero the interaction outside the cutoff, used for reaction-field-zero
+    'PotentialShift'          : ['shift','exactcutoff'],
+    'PotentialSwitch'         : ['rinv','r','switch','exactcutoff']
+}
+
+
+###############################################
+# GEOMETRY COMBINATIONS
+###############################################
+# Each entry is [i-group geometry, j-group geometry]; the names are keys of
+# GeometryElectrostatics, GeometryVdw and Abbreviation.
+GeometryNameList = [
+    [ 'Particle' , 'Particle' ],
+    [ 'Water3'   , 'Particle' ],
+    [ 'Water3'   , 'Water3'   ],
+    [ 'Water4'   , 'Particle' ],
+    [ 'Water4'   , 'Water4'   ]
+]
+
+
+###############################################
+# POTENTIAL / FORCE
+###############################################
+# Kernel flavors generated into each kernel file, in this order.
+VFList = [
+    'PotentialAndForce',
+# 'Potential',   # Not used yet
+    'Force'
+]
+
+
+###############################################
+# GEOMETRY PROPERTIES
+###############################################
+# Dictionaries with lists telling which interactions are present
+# 1,2,3 means particles 1,2,3 (but not 0) have electrostatics!
+# Site indices (0-based) within each geometry that take part in electrostatics.
+GeometryElectrostatics = {
+    'Particle'  : [ 0 ],
+    'Particle2' : [ 0 , 1 ],
+    'Particle3' : [ 0 , 1 , 2 ],
+    'Particle4' : [ 0 , 1 , 2 , 3 ],
+    'Water3'    : [ 0 , 1 , 2 ],
+    'Water4'    : [ 1 , 2 , 3 ]
+}
+
+# Site indices (0-based) within each geometry that take part in Vdw interactions;
+# for both water geometries only site 0 does.
+GeometryVdw = {
+    'Particle'  : [ 0 ],
+    'Particle2' : [ 0 , 1 ],
+    'Particle3' : [ 0 , 1 , 2 ],
+    'Particle4' : [ 0 , 1 , 2 , 3 ],
+    'Water3'    : [ 0 ],
+    'Water4'    : [ 0 ]
+}
+
+
+
+
+# Dictionary to abbreviate all strings (mixed from all the lists)
+# Short forms used when composing kernel and file names. Interaction, modifier,
+# flavor and geometry names share this one table, so the long names must be unique.
+Abbreviation = {
+    'None'                    : 'None',
+    'Coulomb'                 : 'Coul',
+    'Ewald'                   : 'Ew',
+    'ReactionField'           : 'RF',
+    'GeneralizedBorn'         : 'GB',
+    'CubicSplineTable'        : 'CSTab',
+    'LennardJones'            : 'LJ',
+    'Buckingham'              : 'Bham',
+    'PotentialShift'          : 'Sh',
+    'PotentialSwitch'         : 'Sw',
+    'ExactCutoff'             : 'Cut',
+    'PotentialAndForce'       : 'VF',
+    'Potential'               : 'V',
+    'Force'                   : 'F',
+    'Water3'                  : 'W3',
+    'Water4'                  : 'W4',
+    'Particle'                : 'P1',
+    'Particle2'               : 'P2',
+    'Particle3'               : 'P3',
+    'Particle4'               : 'P4'
+}
+
+
+###############################################
+# Functions
+###############################################
+
+# Return a string with the kernel name from current settings
+def MakeKernelFileName(KernelElec,KernelElecMod,KernelVdw,KernelVdwMod,KernelGeom):
+    ElecStr = 'Elec' + Abbreviation[KernelElec]
+    if(KernelElecMod!='None'):
+        ElecStr = ElecStr + Abbreviation[KernelElecMod]
+    VdwStr  = 'Vdw'  + Abbreviation[KernelVdw]
+    if(KernelVdwMod!='None'):
+        VdwStr = VdwStr + Abbreviation[KernelVdwMod]
+    GeomStr = 'Geom' + Abbreviation[KernelGeom[0]] + Abbreviation[KernelGeom[1]]
+    return 'nb_kernel_' + ElecStr + '_' + VdwStr + '_' + GeomStr + '_' + Arch
+
+def MakeKernelName(KernelElec,KernelElecMod,KernelVdw,KernelVdwMod,KernelGeom,KernelVF):
+    ElecStr = 'Elec' + Abbreviation[KernelElec]
+    if(KernelElecMod!='None'):
+        ElecStr = ElecStr + Abbreviation[KernelElecMod]
+    VdwStr  = 'Vdw'  + Abbreviation[KernelVdw]
+    if(KernelVdwMod!='None'):
+        VdwStr = VdwStr + Abbreviation[KernelVdwMod]
+    GeomStr = 'Geom' + Abbreviation[KernelGeom[0]] + Abbreviation[KernelGeom[1]]
+    VFStr   = Abbreviation[KernelVF]
+    return 'nb_kernel_' + ElecStr + '_' + VdwStr + '_' + GeomStr + '_' + VFStr + '_' + Arch
+
+# Return a string with a declaration to use for the kernel;
+# this will be a sequence of string combinations as well as the actual function name
+# Don't worry about field widths - that is just pretty-printing for the header!
+def MakeKernelDecl(KernelName,KernelElec,KernelElecMod,KernelVdw,KernelVdwMod,KernelGeom,KernelOther,KernelVF):
+    KernelStr   = '\"'+KernelName+'\"'
+    ArchStr     = '\"'+Arch+'\"'
+    ElecStr     = '\"'+KernelElec+'\"'
+    ElecModStr  = '\"'+KernelElecMod+'\"'
+    VdwStr      = '\"'+KernelVdw+'\"'
+    VdwModStr   = '\"'+KernelVdwMod+'\"'
+    GeomStr     = '\"'+KernelGeom[0]+KernelGeom[1]+'\"'
+    OtherStr    = '\"'+KernelOther+'\"'
+    VFStr       = '\"'+KernelVF+'\"'
+
+    ThisSpec = ArchStr+', '+ElecStr+', '+ElecModStr+', '+VdwStr+', '+VdwModStr+', '+GeomStr+', '+OtherStr+', '+VFStr
+    ThisDecl = '    { '+KernelName+', '+KernelStr+', '+ThisSpec+' }'
+    return ThisDecl
+
+
+# Returns 1 if this kernel should be created, 0 if we should skip it
+# This routine is not critical - it is not the end of the world if we create more kernels,
+# but since the number is pretty large we save both space and compile-time by reducing it a bit.
+def KeepKernel(KernelElec,KernelElecMod,KernelVdw,KernelVdwMod,KernelGeom,KernelVF):
+
+    # No need for kernels without interactions
+    if(KernelElec=='None' and KernelVdw=='None'):
+        return 0
+
+    # No need for modifiers without interactions
+    if((KernelElec=='None' and KernelElecMod!='None') or (KernelVdw=='None' and KernelVdwMod!='None')):
+        return 0
+
+    # No need for LJ-only water optimization, or water optimization with implicit solvent.
+    if('Water' in KernelGeom[0] and (KernelElec=='None' or 'GeneralizedBorn' in KernelElec)):
+        return 0
+
+    # Non-matching table settings are pointless
+    if( ('Table' in KernelElec) and ('Table' in KernelVdw) and KernelElec!=KernelVdw ):
+        return 0
+
+    # Try to reduce the number of different switch/shift options to get a reasonable number of kernels
+    # For electrostatics, reaction-field can use 'exactcutoff', and ewald can use switch or shift.
+    if(KernelElecMod=='ExactCutoff' and KernelElec!='ReactionField'):
+        return 0
+    if(KernelElecMod in ['PotentialShift','PotentialSwitch'] and KernelElec!='Ewald'):
+        return 0
+    # For Vdw, we support switch and shift for Lennard-Jones/Buckingham
+    if((KernelVdwMod=='ExactCutoff') or
+       (KernelVdwMod in ['PotentialShift','PotentialSwitch'] and KernelVdw not in ['LennardJones','Buckingham'])):
+        return 0
+
+    # Choose either switch or shift and don't mix them...
+    if((KernelElecMod=='PotentialShift' and KernelVdwMod=='PotentialSwitch') or
+       (KernelElecMod=='PotentialSwitch' and KernelVdwMod=='PotentialShift')):
+        return 0
+
+    # Don't use a Vdw kernel with a modifier if the electrostatics one does not have one
+    if(KernelElec!='None' and KernelElecMod=='None' and KernelVdwMod!='None'):
+        return 0
+
+    # Don't use an electrostatics kernel with a modifier if the vdw one does not have one,
+    # unless the electrostatics one is reaction-field with exact cutoff.
+    if(KernelVdw!='None' and KernelVdwMod=='None' and KernelElecMod!='None'):
+        if(KernelElec=='ReactionField' and KernelVdw!='CubicSplineTable'):
+            return 0
+        elif(KernelElec!='ReactionField'):
+            return 0
+
+    return 1
+
+
+
+#
+# The preprocessor will automatically expand the interactions for water and other
+# geometries inside the kernel, but to get this right we need to setup a couple
+# of defines - we do them in a separate routine to keep the main loop clean.
+#
+# While this routine might look a bit complex it is actually quite straightforward,
+# and the best news is that you wont have to modify _anything_ for a new geometry
+# as long as you correctly define its Electrostatics/Vdw geometry in the lists above!
+#
+def SetDefines(KernelElec,KernelElecMod,KernelVdw,KernelVdwMod,KernelGeom,KernelVF,defines):
+    # What is the _name_ for the i/j group geometry?
+    igeometry            = KernelGeom[0]
+    jgeometry            = KernelGeom[1]
+    # define so we can access it in the source when the preprocessor runs
+    defines['GEOMETRY_I'] = igeometry
+    defines['GEOMETRY_J'] = jgeometry
+
+    # For the i/j groups, extract a python list of which sites have electrostatics
+    # For SPC/TIP3p this will be [1,1,1], while TIP4p (no elec on first site) will be [0,1,1,1]
+    ielec                = GeometryElectrostatics[igeometry]
+    jelec                = GeometryElectrostatics[jgeometry]
+    # Zero out the corresponding lists in case we dont do Elec
+    if(KernelElec=='None'):
+        ielec = []
+        jelec = []
+
+    # Extract similar interaction lists for Vdw interactions (example for SPC: [1,0,0])
+    iVdw                 = GeometryVdw[igeometry]
+    jVdw                 = GeometryVdw[jgeometry]
+
+    # Zero out the corresponding lists in case we dont do Vdw
+    if(KernelVdw=='None'):
+        iVdw = []
+        jVdw = []
+
+    # iany[] and jany[] contains lists of the particles actually used (for interactions) in this kernel
+    iany = list(set(ielec+iVdw))  # convert to+from set to make elements unique
+    jany = list(set(jelec+jVdw))
+
+    defines['PARTICLES_ELEC_I'] = ielec
+    defines['PARTICLES_ELEC_J'] = jelec
+    defines['PARTICLES_VDW_I']  = iVdw
+    defines['PARTICLES_VDW_J']  = jVdw
+    defines['PARTICLES_I']      = iany
+    defines['PARTICLES_J']      = jany
+
+    # elecij,Vdwij are sets with pairs of particles for which the corresponding interaction is done
+    # (and anyij again corresponds to either electrostatics or Vdw)
+    elecij = []
+    Vdwij  = []
+    anyij  = []
+
+    for i in ielec:
+        for j in jelec:
+            elecij.append([i,j])
+
+    for i in iVdw:
+        for j in jVdw:
+            Vdwij.append([i,j])
+
+    for i in iany:
+        for j in jany:
+            if [i,j] in elecij or [i,j] in Vdwij:
+                anyij.append([i,j])
+
+    defines['PAIRS_IJ']     = anyij
+
+    # Make an 2d list-of-distance-properties-to-calculate for i,j
+    ni = max(iany)+1
+    nj = max(jany)+1
+    # Each element properties[i][j] is an empty list
+    properties = [ [ [] for j in range(0,nj) ] for i in range (0,ni) ]
+    # Add properties to each set
+    for i in range(0,ni):
+        for j in range(0,nj):
+            if [i,j] in elecij:
+                properties[i][j] = properties[i][j] + ['electrostatics'] + ElectrostaticsList[KernelElec] + ModifierList[KernelElecMod]
+            if [i,j] in Vdwij:
+                properties[i][j] = properties[i][j] + ['vdw'] + VdwList[KernelVdw] + ModifierList[KernelVdwMod]
+            # Add rinv if we need r
+            if 'r' in properties[i][j]:
+                properties[i][j] = properties[i][j] + ['rinv']
+            # Add rsq if we need rinv or rinsq
+            if 'rinv' in properties[i][j] or 'rinvsq' in properties[i][j]:
+                properties[i][j] = properties[i][j] + ['rsq']
+
+    defines['INTERACTION_FLAGS']    = properties
+
+
+
+def PrintStatistics(ratio):
+    ratio = 100.0*ratio
+    print '\rGenerating %s nonbonded kernels... %5.1f%%' % (Arch,ratio),
+    sys.stdout.flush()
+
+
+
+defines = {}
+kerneldecl = []
+
+cnt     = 0.0
+nelec   = len(ElectrostaticsList)
+nVdw    = len(VdwList)
+nmod    = len(ModifierList)
+ngeom   = len(GeometryNameList)
+
+ntot    = nelec*nmod*nVdw*nmod*ngeom
+
+numKernels = 0
+
+fpdecl = open('nb_kernel_' + Arch + '.c','w')
+fpdecl.write( FileHeader )
+fpdecl.write( '#ifndef nb_kernel_' + Arch + '_h\n' )
+fpdecl.write( '#define nb_kernel_' + Arch + '_h\n\n' )
+fpdecl.write( '#include "../nb_kernel.h"\n\n' )
+
+# Visit every electrostatics/modifier/Vdw/modifier/geometry combination. Each
+# combination gets one kernel file holding all of its VFList flavors;
+# combinations rejected by KeepKernel leave an empty file that is removed again.
+for KernelElec in ElectrostaticsList:
+    defines['KERNEL_ELEC'] = KernelElec
+
+    for KernelElecMod in ModifierList:
+        defines['KERNEL_MOD_ELEC'] = KernelElecMod
+
+        for KernelVdw in VdwList:
+            defines['KERNEL_VDW'] = KernelVdw
+
+            for KernelVdwMod in ModifierList:
+                defines['KERNEL_MOD_VDW'] = KernelVdwMod
+
+                for KernelGeom in GeometryNameList:
+
+                    # Counted before filtering so that cnt/ntot reaches 1.0 at the end
+                    cnt += 1
+                    KernelFilename = MakeKernelFileName(KernelElec,KernelElecMod,KernelVdw,KernelVdwMod,KernelGeom) + '.c'
+                    fpkernel = open(KernelFilename,'w')
+                    defines['INCLUDE_HEADER'] = 1  # Include header first time in new file
+                    DoHeader = 1
+
+                    for KernelVF in VFList:
+
+                        KernelName = MakeKernelName(KernelElec,KernelElecMod,KernelVdw,KernelVdwMod,KernelGeom,KernelVF)
+
+                        defines['KERNEL_NAME'] = KernelName
+                        defines['KERNEL_VF']   = KernelVF
+
+                        # Check if this is a valid/sane/usable combination
+                        if not KeepKernel(KernelElec,KernelElecMod,KernelVdw,KernelVdwMod,KernelGeom,KernelVF):
+                            continue;
+
+                        # The overall kernel settings determine what the _kernel_ calculates, but for the water
+                        # kernels this does not mean that every pairwise interaction has e.g. Vdw interactions.
+                        # This routine sets defines of what to calculate for each pair of particles in those cases.
+                        SetDefines(KernelElec,KernelElecMod,KernelVdw,KernelVdwMod,KernelGeom,KernelVF,defines)
+
+                        # The license header is written once, ahead of the first kernel in the file
+                        if(DoHeader==1):
+                            fpkernel.write( FileHeader )
+
+                        # Expand the template into a temporary file named after the kernel
+                        gmxpreprocess('nb_kernel_template_' + Arch + '.pre', KernelName+'.tmp' , defines, force=1,contentType='C')
+                        numKernels = numKernels + 1
+
+                        defines['INCLUDE_HEADER'] = 0   # Header has been included once now
+                        DoHeader=0
+
+                        # Append temp file contents to the common kernelfile
+                        fptmp = open(KernelName+'.tmp','r')
+                        fpkernel.writelines(fptmp.readlines())
+                        fptmp.close()
+                        os.remove(KernelName+'.tmp')
+
+                        # Add a declaration for this kernel
+                        fpdecl.write('nb_kernel_t ' + KernelName + ';\n');
+
+                        # Add declaration to the buffer
+                        KernelOther=''
+                        kerneldecl.append(MakeKernelDecl(KernelName,KernelElec,KernelElecMod,KernelVdw,KernelVdwMod,KernelGeom,KernelOther,KernelVF))
+
+                    # Nothing was written if every flavor was skipped: drop the empty file
+                    filesize = fpkernel.tell()
+                    fpkernel.close()
+                    if(filesize==0):
+                        os.remove(KernelFilename)
+
+                    PrintStatistics(cnt/ntot)
+                pass
+            pass
+        pass
+    pass
+pass
+
+# Write out the list of settings and corresponding kernels to the declaration file
+fpdecl.write( '\n\n' )
+fpdecl.write( 'nb_kernel_info_t\n' )
+fpdecl.write( 'kernellist_'+Arch+'[] =\n' )
+fpdecl.write( '{\n' )
+for decl in kerneldecl[0:-1]:
+    fpdecl.write( decl + ',\n' )
+fpdecl.write( kerneldecl[-1] + '\n' )
+fpdecl.write( '};\n\n' )
+fpdecl.write( 'int\n' )
+fpdecl.write( 'kernellist_'+Arch+'_size = sizeof(kernellist_'+Arch+')/sizeof(kernellist_'+Arch+'[0]);\n\n')
+fpdecl.write( '#endif\n')
+fpdecl.close()
diff --git a/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCSTab_VdwCSTab_GeomP1P1_sparc64_hpc_ace_double.c b/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCSTab_VdwCSTab_GeomP1P1_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..4b3773d
--- /dev/null
@@ -0,0 +1,711 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwCSTab_GeomP1P1_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: CubicSplineTable
+ * VdW interaction:            CubicSplineTable
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        PotentialAndForce (results are passed to ff, fr->fshift and kernel_data->energygrp_elec/_vdw)
+ */
+void
+nb_kernel_ElecCSTab_VdwCSTab_GeomP1P1_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Naming: numeric suffixes 0,1,2,3 are particle indices within a water in the inner or outer loop;
+     * this particle-particle kernel only uses 0.
+     * Suffixes A,B denote the two j particles handled per unrolled inner iteration, i.e. the two
+     * jnr indices whose data are placed in the two positions of a _fjsp_v2r8 SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+    real             *vftab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    vftab            = kernel_data->table_elec_vdw->data;
+    vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_elec_vdw->scale);
+
+    /* Give the j indices and offsets defined values up front so compilers do not warn about uninitialized use */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Outer loop: one pass per neighborlist entry, i.e. per i particle (nri entries) */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Offset (in reals) of this entry's shift vector; used for both shiftvec and fshift */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* The j neighbors of this entry are jjnr[j_index_start] .. jjnr[j_index_end-1] */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* i particle index and its offset (in reals) into the coordinate and force arrays */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_1rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+
+        /* i particle parameters: charge scaled by facel (fr->epsfac) and the start of its row in vdwparam */
+        iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_load1_v2r8(charge+inr+0));
+        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+        /* Reset the potential sums accumulated over this entry's j neighbors */
+        velecsum         = _fjsp_setzero_v2r8();
+        vvdwsum          = _fjsp_setzero_v2r8();
+
+        /* Inner loop: two j neighbors (A,B) per iteration; an odd final neighbor is handled after the loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get the two j neighbor indices and their offsets (in reals) into x and f */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* Load coordinates of j particles A and B */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            /* Load parameters for j particles: charges, and each type's offset within a vdwparam row (2 reals per type) */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Pair parameters: product of the charges, and c6/c12 loaded from vdwparam for both j particles */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer; vfeps is the remainder */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS: first 4 reals (Y,F,G,H) of the 12 stored per table point */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq00,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,FF),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            /* CUBIC SPLINE TABLE DISPERSION: reals 4-7 of each table point (index advanced by 4) */
+            vfconv.i[0]       += 4;
+            vfconv.i[1]       += 4;
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 2 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION: reals 8-11 of each table point (advanced index plus 4 and 6) */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 4 );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 6 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            vvdw             = _fjsp_add_v2r8(vvdw12,vvdw6);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            /* Update potential sums for this i atom from the interactions with these two j atoms. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fscal,dx00,dy00,dz00);
+
+            /* Inner loop uses 76 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* Odd leftover neighbor: load coordinates of j particle A only */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            /* Load parameters for the single j particle (its charge is loaded together with a zeroed register) */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Pair parameters: product of the charges, and c6/c12 loaded from vdwparam for j particle A */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer; vfeps is the remainder */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS: only table point A is read; the transpose partner is a zero register */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq00,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,FF),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            /* CUBIC SPLINE TABLE DISPERSION: reals 4-7 of table point A (index advanced by 4) */
+            vfconv.i[0]       += 4;
+            vfconv.i[1]       += 4;
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION: reals 8-11 of table point A (advanced index plus 4 and 6) */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            vvdw             = _fjsp_add_v2r8(vvdw12,vvdw6);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            /* Update potential sums; the unpacklo with a zero register presumably discards the unused second element */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+            vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fscal,dx00,dy00,dz00);
+
+            /* Inner loop uses 76 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_1atom_swizzle_v2r8(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies for energy group index ggid */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations (counted per j neighbor, not per unrolled SIMD iteration) */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 9 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Report the flop count using the per-iteration figures noted above (9 outer, 76 inner) */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_VF,outeriter*9 + inneriter*76);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwCSTab_GeomP1P1_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: CubicSplineTable
+ * VdW interaction:            CubicSplineTable
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        Force (results are passed to ff and fr->fshift; no potentials are accumulated)
+ */
+void
+nb_kernel_ElecCSTab_VdwCSTab_GeomP1P1_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Naming: numeric suffixes 0,1,2,3 are particle indices within a water in the inner or outer loop;
+     * this particle-particle kernel only uses 0.
+     * Suffixes A,B denote the two j particles handled per unrolled inner iteration, i.e. the two
+     * jnr indices whose data are placed in the two positions of a _fjsp_v2r8 SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+    real             *vftab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    vftab            = kernel_data->table_elec_vdw->data;
+    vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_elec_vdw->scale);
+
+    /* Give the j indices and offsets defined values up front so compilers do not warn about uninitialized use */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Outer loop: one pass per neighborlist entry, i.e. per i particle (nri entries) */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Offset (in reals) of this entry's shift vector; used for both shiftvec and fshift */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* The j neighbors of this entry are jjnr[j_index_start] .. jjnr[j_index_end-1] */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* i particle index and its offset (in reals) into the coordinate and force arrays */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_1rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+
+        /* i particle parameters: charge scaled by facel (fr->epsfac) and the start of its row in vdwparam */
+        iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_load1_v2r8(charge+inr+0));
+        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+        /* Inner loop: two j neighbors (A,B) per iteration; an odd final neighbor is handled after the loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get the two j neighbor indices and their offsets (in reals) into x and f */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* Load coordinates of j particles A and B */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            /* Load parameters for j particles: charges, and each type's offset within a vdwparam row (2 reals per type) */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Pair parameters: product of the charges, and c6/c12 loaded from vdwparam for both j particles */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer; vfeps is the remainder */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS: first 4 reals (Y,F,G,H) of the 12 stored per table point */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,FF),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            /* CUBIC SPLINE TABLE DISPERSION: reals 4-7 of each table point (index advanced by 4) */
+            vfconv.i[0]       += 4;
+            vfconv.i[1]       += 4;
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 2 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION: reals 8-11 of each table point (advanced index plus 4 and 6) */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 4 );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 6 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fscal,dx00,dy00,dz00);
+
+            /* Inner loop uses 64 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* Odd leftover neighbor: load coordinates of j particle A only */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            /* Load parameters for the single j particle (its charge is loaded together with a zeroed register) */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Pair parameters: product of the charges, and c6/c12 loaded from vdwparam for j particle A */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer; vfeps is the remainder */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS: only table point A is read; the transpose partner is a zero register */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,FF),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            /* CUBIC SPLINE TABLE DISPERSION: reals 4-7 of table point A (index advanced by 4) */
+            vfconv.i[0]       += 4;
+            vfconv.i[1]       += 4;
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION: reals 8-11 of table point A (advanced index plus 4 and 6) */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fscal,dx00,dy00,dz00);
+
+            /* Inner loop uses 64 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_1atom_swizzle_v2r8(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations (counted per j neighbor, not per unrolled SIMD iteration) */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 7 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Report the flop count using the per-iteration figures noted above (7 outer, 64 inner) */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_F,outeriter*7 + inneriter*64);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCSTab_VdwCSTab_GeomW3P1_sparc64_hpc_ace_double.c b/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCSTab_VdwCSTab_GeomW3P1_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..66d4fc9
--- /dev/null
@@ -0,0 +1,1173 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwCSTab_GeomW3P1_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: CubicSplineTable
+ * VdW interaction:            CubicSplineTable
+ * Geometry:                   Water3-Particle
+ * Calculate force/pot:        PotentialAndForce
+ *
+ * For every i entry of the neighborlist (three i sites, suffixes 0..2) this
+ * routine walks the listed j particles, two per SIMD iteration plus a
+ * one-particle remainder, and evaluates tabulated electrostatics for all three
+ * i sites and tabulated dispersion/repulsion for i site 0 only.
+ *
+ * Arguments, as they are used below:
+ *  nlist       - supplies nri, iinr, jindex, jjnr, shift and gid.
+ *  xx          - coordinates, read as a flat real array starting at xx[0].
+ *  ff          - forces, flat real array starting at ff[0]; handed to the
+ *                decrement/update helper routines.
+ *  fr          - supplies shift_vec, fshift, epsfac, ntype and nbfp.
+ *  mdatoms     - supplies chargeA and typeA.
+ *  kernel_data - supplies table_elec_vdw (data, scale) and the energygrp_elec /
+ *                energygrp_vdw arrays handed to the potential update helper.
+ *  nrnb        - flop accounting, updated through inc_nrnb() at the end.
+ */
+void
+nb_kernel_ElecCSTab_VdwCSTab_GeomW3P1_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     *
+     * NOTE(review): a number of the locals declared below (for example tx, ty, tz,
+     * rcutoff, rcutoff2, jidxall, vdwioffset1, vdwioffset2, crf, krf, krf2, Heps,
+     * dummy_mask, cutoff_mask, gbconv, ewconv) are never referenced in this
+     * function; they look like leftovers of the kernel generator template.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+    real             *vftab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv; /* vfconv: SIMD table indices re-read as two integers */
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    vftab            = kernel_data->table_elec_vdw->data;
+    vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_elec_vdw->scale);
+
+    /* Setup water-specific parameters.
+     * These are taken from the first i entry (iinr[0]) only and are not refreshed
+     * inside the outer loop - presumably every i water in the list shares the same
+     * charges and VdW type; TODO confirm against the neighborlist construction.
+     * The charges are pre-multiplied by facel (fr->epsfac). Only site 0 gets a VdW
+     * parameter offset.
+     */
+    inr              = nlist->iinr[0];
+    iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+        vvdwsum          = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop.
+         * Two j particles (A and B) are handled per iteration; the loop stops while
+         * at least two remain, and a possible single leftover is handled after it.
+         */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             * i site 0 - j: electrostatics + VdW
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12; /* table stride is 12 reals per point; offsets read below: elec +0/+2, dispersion +4/+6, repulsion +8/+10 */
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS
+             * Assuming _fjsp_madd_v2r8(a,b,c) yields a*b+c (TODO confirm), the lines
+             * below form Fp = F + eps*(G + eps*H), VV = Y + eps*Fp and
+             * FF = Fp + eps*(G + 2*eps*H), with eps = vfeps.
+             */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq00,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,FF),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            vfconv.i[0]       += 4;
+            vfconv.i[1]       += 4;
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 2 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 4 );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 6 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            vvdw             = _fjsp_add_v2r8(vvdw12,vvdw6);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             * i site 1 - j: electrostatics only
+             **************************/
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r10,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq10,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,FF),_fjsp_mul_v2r8(vftabscale,rinv10)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             * i site 2 - j: electrostatics only
+             **************************/
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r20,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq20,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,FF),_fjsp_mul_v2r8(vftabscale,rinv20)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0); /* hand the accumulated j terms for particles A and B to the force array helper */
+
+            /* Inner loop uses 171 flops */
+        }
+
+        if(jidx<j_index_end) /* exactly one j particle is left when the j count is odd */
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+
+            /* Load parameters for j particles */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             * i site 0 - j: electrostatics + VdW
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12; /* scaled like i[0], but only i[0] is used for table loads in this remainder branch */
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS
+             * Only the table entry for index i[0] is loaded; the second operand of
+             * each transpose is set to zero instead of being read from the table.
+             */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq00,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,FF),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            vfconv.i[0]       += 4;
+            vfconv.i[1]       += 4;
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            vvdw             = _fjsp_add_v2r8(vvdw12,vvdw6);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom.
+             * The unpacklo with a zero vector presumably keeps the first element and
+             * clears the second, so the unused SIMD position adds nothing to the sums
+             * - TODO confirm against the intrinsic's documentation.
+             */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+            vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             * i site 1 - j: electrostatics only
+             **************************/
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r10,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq10,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,FF),_fjsp_mul_v2r8(vftabscale,rinv10)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             * i site 2 - j: electrostatics only
+             **************************/
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r20,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq20,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,FF),_fjsp_mul_v2r8(vftabscale,rinv20)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 171 flops */
+        }
+
+        /* End of innermost loop.
+         * The nine per-site i force accumulators are handed to the helper together
+         * with the force and shift-force locations of this i entry.
+         */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 20 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops: 20 per outer iteration and 171 per j particle,
+     * matching the per-loop counts quoted in the comments above.
+     */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_VF,outeriter*20 + inneriter*171);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwCSTab_GeomW3P1_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: CubicSplineTable
+ * VdW interaction:            CubicSplineTable
+ * Geometry:                   Water3-Particle
+ * Calculate force/pot:        Force
+ *
+ * Force-only kernel: no potential energies are accumulated or stored here.
+ * For each of the nlist->nri outer entries, three i atoms (suffixes 0,1,2) interact
+ * with the j particles jjnr[jindex[iidx]] .. jjnr[jindex[iidx+1]-1]. The j particles
+ * are processed two per iteration (A,B); an odd last one is handled separately.
+ * i atom 0 gets tabulated electrostatics plus dispersion/repulsion, i atoms 1 and 2
+ * get tabulated electrostatics only.
+ *
+ * Inputs read:
+ *   nlist       - nri, iinr, jindex, jjnr, shift (gid is fetched but not used)
+ *   xx          - coordinates, addressed as a flat real array at DIM*atom
+ *   fr          - shift_vec, fshift, epsfac, ntype, nbfp
+ *   mdatoms     - chargeA, typeA
+ *   kernel_data - table_elec_vdw (data and scale); each table point spans 12 reals:
+ *                 electrostatics at +0, dispersion at +4, repulsion at +8
+ * Outputs:
+ *   ff          - pointers into it are passed to the gmx_fjsp_* force update helpers
+ *   fr->fshift  - passed to the i-force update helper together with ff
+ *   nrnb        - flop count added via inc_nrnb
+ */
+void
+nb_kernel_ElecCSTab_VdwCSTab_GeomW3P1_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. the two different
+     * jnr indices whose data is put in the two positions of the SIMD register.
+     * Several of the locals declared below (e.g. rcutoff, velecsum, gbconv) are never
+     * referenced in this particular kernel.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+    real             *vftab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    vftab            = kernel_data->table_elec_vdw->data;
+    vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_elec_vdw->scale);
+
+    /* Setup water-specific parameters.
+     * Charges and the VdW type are read once, from the atoms of the first list entry
+     * (iinr[0]), and reused for every outer iteration; i charges are pre-multiplied by facel.
+     * NOTE(review): iinr[0] is read before the nri loop bound is checked - confirm that
+     * callers never pass an empty list, or that iinr always holds at least one valid entry.
+     */
+    inr              = nlist->iinr[0];
+    iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* Avoid stupid compiler warnings: these are otherwise only assigned inside the j loops */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop.
+         * Two j particles (A,B) are handled per iteration; the bound j_index_end-1
+         * leaves an odd last entry for the separate branch after this loop.
+         */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             * i0-j0: elec + VdW      *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS
+             * The index was scaled by 12 reals per table point; this block reads the 4 reals at +0.
+             * Y is loaded and passed through the transpose macro but does not enter the
+             * force expression below (only F, G and H do).
+             */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,FF),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            /* CUBIC SPLINE TABLE DISPERSION (the 4 reals at +4 within the table point) */
+            vfconv.i[0]       += 4;
+            vfconv.i[1]       += 4;
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 2 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION (the 4 reals at +8: index already advanced by 4, loads add 4 more) */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 4 );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 6 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             * i1-j0: elec only       *
+             **************************/
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r10,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,FF),_fjsp_mul_v2r8(vftabscale,rinv10)));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             * i2-j0: elec only       *
+             **************************/
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r20,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,FF),_fjsp_mul_v2r8(vftabscale,rinv20)));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 151 flops */
+        }
+
+        if(jidx<j_index_end) /* one j particle is left over after the pairwise loop */
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* Remainder for an odd neighbor count: only jnrA is set here. The table loads
+             * for the second index are replaced by zero vectors, and fscal is passed through
+             * _fjsp_unpacklo_v2r8 with a zero vector before it is used (presumably to clear
+             * the unused second lane - TODO confirm intrinsic semantics).
+             * Load j atom coordinates.
+             */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+
+            /* Load parameters for j particles */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             * i0-j0: elec + VdW      *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,FF),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            vfconv.i[0]       += 4;
+            vfconv.i[1]       += 4;
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             * i1-j0: elec only       *
+             **************************/
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r10,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,FF),_fjsp_mul_v2r8(vftabscale,rinv10)));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             * i2-j0: elec only       *
+             **************************/
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r20,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,FF),_fjsp_mul_v2r8(vftabscale,rinv20)));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 151 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 18 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_F,outeriter*18 + inneriter*151);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_sparc64_hpc_ace_double.c b/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..db19a17
--- /dev/null
@@ -0,0 +1,2311 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: CubicSplineTable
+ * VdW interaction:            CubicSplineTable
+ * Geometry:                   Water3-Water3
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B;
+    _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B;
+    _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+    _fjsp_v2r8       dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+    real             *vftab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    vftab            = kernel_data->table_elec_vdw->data;
+    vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_elec_vdw->scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq0              = gmx_fjsp_set1_v2r8(charge[inr+0]);
+    jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+    jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    qq00             = _fjsp_mul_v2r8(iq0,jq0);
+    c6_00            = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    qq01             = _fjsp_mul_v2r8(iq0,jq1);
+    qq02             = _fjsp_mul_v2r8(iq0,jq2);
+    qq10             = _fjsp_mul_v2r8(iq1,jq0);
+    qq11             = _fjsp_mul_v2r8(iq1,jq1);
+    qq12             = _fjsp_mul_v2r8(iq1,jq2);
+    qq20             = _fjsp_mul_v2r8(iq2,jq0);
+    qq21             = _fjsp_mul_v2r8(iq2,jq1);
+    qq22             = _fjsp_mul_v2r8(iq2,jq2);
+
+    /* Zero-initialize the j-atom indices and coordinate offsets to avoid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+        vvdwsum          = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx01             = _fjsp_sub_v2r8(ix0,jx1);
+            dy01             = _fjsp_sub_v2r8(iy0,jy1);
+            dz01             = _fjsp_sub_v2r8(iz0,jz1);
+            dx02             = _fjsp_sub_v2r8(ix0,jx2);
+            dy02             = _fjsp_sub_v2r8(iy0,jy2);
+            dz02             = _fjsp_sub_v2r8(iz0,jz2);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+            rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+            rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq00,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,FF),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            vfconv.i[0]       += 4;
+            vfconv.i[1]       += 4;
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 2 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 4 );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 6 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            vvdw             = _fjsp_add_v2r8(vvdw12,vvdw6);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r01              = _fjsp_mul_v2r8(rsq01,rinv01);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r01,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq01,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq01,FF),_fjsp_mul_v2r8(vftabscale,rinv01)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+            
+            fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r02              = _fjsp_mul_v2r8(rsq02,rinv02);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r02,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq02,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq02,FF),_fjsp_mul_v2r8(vftabscale,rinv02)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+            
+            fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r10,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq10,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,FF),_fjsp_mul_v2r8(vftabscale,rinv10)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r11,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq11,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,FF),_fjsp_mul_v2r8(vftabscale,rinv11)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r12,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq12,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,FF),_fjsp_mul_v2r8(vftabscale,rinv12)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r20,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq20,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,FF),_fjsp_mul_v2r8(vftabscale,rinv20)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r21,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq21,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,FF),_fjsp_mul_v2r8(vftabscale,rinv21)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r22,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq22,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,FF),_fjsp_mul_v2r8(vftabscale,rinv22)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            gmx_fjsp_decrement_3rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 444 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx01             = _fjsp_sub_v2r8(ix0,jx1);
+            dy01             = _fjsp_sub_v2r8(iy0,jy1);
+            dz01             = _fjsp_sub_v2r8(iz0,jz1);
+            dx02             = _fjsp_sub_v2r8(ix0,jx2);
+            dy02             = _fjsp_sub_v2r8(iy0,jy2);
+            dz02             = _fjsp_sub_v2r8(iz0,jz2);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+            rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+            rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq00,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,FF),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            vfconv.i[0]       += 4;
+            vfconv.i[1]       += 4;
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            vvdw             = _fjsp_add_v2r8(vvdw12,vvdw6);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+            vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r01              = _fjsp_mul_v2r8(rsq01,rinv01);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r01,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq01,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq01,FF),_fjsp_mul_v2r8(vftabscale,rinv01)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+            
+            fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r02              = _fjsp_mul_v2r8(rsq02,rinv02);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r02,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq02,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq02,FF),_fjsp_mul_v2r8(vftabscale,rinv02)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+            
+            fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r10,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq10,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,FF),_fjsp_mul_v2r8(vftabscale,rinv10)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r11,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq11,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,FF),_fjsp_mul_v2r8(vftabscale,rinv11)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r12,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq12,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,FF),_fjsp_mul_v2r8(vftabscale,rinv12)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r20,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq20,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,FF),_fjsp_mul_v2r8(vftabscale,rinv20)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r21,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq21,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,FF),_fjsp_mul_v2r8(vftabscale,rinv21)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r22,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq22,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,FF),_fjsp_mul_v2r8(vftabscale,rinv22)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            gmx_fjsp_decrement_3rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 444 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 20 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*444);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: CubicSplineTable
+ * VdW interaction:            CubicSplineTable
+ * Geometry:                   Water3-Water3
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B;
+    _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B;
+    _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+    _fjsp_v2r8       dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+    real             *vftab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    vftab            = kernel_data->table_elec_vdw->data;
+    vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_elec_vdw->scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq0              = gmx_fjsp_set1_v2r8(charge[inr+0]);
+    jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+    jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    qq00             = _fjsp_mul_v2r8(iq0,jq0);
+    c6_00            = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    qq01             = _fjsp_mul_v2r8(iq0,jq1);
+    qq02             = _fjsp_mul_v2r8(iq0,jq2);
+    qq10             = _fjsp_mul_v2r8(iq1,jq0);
+    qq11             = _fjsp_mul_v2r8(iq1,jq1);
+    qq12             = _fjsp_mul_v2r8(iq1,jq2);
+    qq20             = _fjsp_mul_v2r8(iq2,jq0);
+    qq21             = _fjsp_mul_v2r8(iq2,jq1);
+    qq22             = _fjsp_mul_v2r8(iq2,jq2);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx01             = _fjsp_sub_v2r8(ix0,jx1);
+            dy01             = _fjsp_sub_v2r8(iy0,jy1);
+            dz01             = _fjsp_sub_v2r8(iz0,jz1);
+            dx02             = _fjsp_sub_v2r8(ix0,jx2);
+            dy02             = _fjsp_sub_v2r8(iy0,jy2);
+            dz02             = _fjsp_sub_v2r8(iz0,jz2);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+            rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+            rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,FF),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            vfconv.i[0]       += 4;
+            vfconv.i[1]       += 4;
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 2 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 4 );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 6 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r01              = _fjsp_mul_v2r8(rsq01,rinv01);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r01,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq01,FF),_fjsp_mul_v2r8(vftabscale,rinv01)));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+            
+            fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r02              = _fjsp_mul_v2r8(rsq02,rinv02);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r02,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq02,FF),_fjsp_mul_v2r8(vftabscale,rinv02)));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+            
+            fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r10,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,FF),_fjsp_mul_v2r8(vftabscale,rinv10)));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r11,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,FF),_fjsp_mul_v2r8(vftabscale,rinv11)));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r12,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,FF),_fjsp_mul_v2r8(vftabscale,rinv12)));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r20,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,FF),_fjsp_mul_v2r8(vftabscale,rinv20)));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r21,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,FF),_fjsp_mul_v2r8(vftabscale,rinv21)));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r22,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,FF),_fjsp_mul_v2r8(vftabscale,rinv22)));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            gmx_fjsp_decrement_3rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 400 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx01             = _fjsp_sub_v2r8(ix0,jx1);
+            dy01             = _fjsp_sub_v2r8(iy0,jy1);
+            dz01             = _fjsp_sub_v2r8(iz0,jz1);
+            dx02             = _fjsp_sub_v2r8(ix0,jx2);
+            dy02             = _fjsp_sub_v2r8(iy0,jy2);
+            dz02             = _fjsp_sub_v2r8(iz0,jz2);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+            rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+            rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,FF),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            vfconv.i[0]       += 4;
+            vfconv.i[1]       += 4;
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r01              = _fjsp_mul_v2r8(rsq01,rinv01);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r01,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq01,FF),_fjsp_mul_v2r8(vftabscale,rinv01)));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+            
+            fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r02              = _fjsp_mul_v2r8(rsq02,rinv02);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r02,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq02,FF),_fjsp_mul_v2r8(vftabscale,rinv02)));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+            
+            fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r10,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,FF),_fjsp_mul_v2r8(vftabscale,rinv10)));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r11,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,FF),_fjsp_mul_v2r8(vftabscale,rinv11)));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r12,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,FF),_fjsp_mul_v2r8(vftabscale,rinv12)));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r20,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,FF),_fjsp_mul_v2r8(vftabscale,rinv20)));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r21,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,FF),_fjsp_mul_v2r8(vftabscale,rinv21)));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r22,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,FF),_fjsp_mul_v2r8(vftabscale,rinv22)));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            gmx_fjsp_decrement_3rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 400 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 18 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*400);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_sparc64_hpc_ace_double.c b/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..70be5c8
--- /dev/null
@@ -0,0 +1,1329 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: CubicSplineTable
+ * VdW interaction:            CubicSplineTable
+ * Geometry:                   Water4-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+    real             *vftab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    vftab            = kernel_data->table_elec_vdw->data;
+    vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_elec_vdw->scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_4rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+        fix3             = _fjsp_setzero_v2r8();
+        fiy3             = _fjsp_setzero_v2r8();
+        fiz3             = _fjsp_setzero_v2r8();
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+        vvdwsum          = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx30             = _fjsp_sub_v2r8(ix3,jx0);
+            dy30             = _fjsp_sub_v2r8(iy3,jy0);
+            dz30             = _fjsp_sub_v2r8(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            vfconv.i[0]       += 4;
+            vfconv.i[1]       += 4;
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 2 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 4 );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 6 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            vvdw             = _fjsp_add_v2r8(vvdw12,vvdw6);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r10,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq10,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,FF),_fjsp_mul_v2r8(vftabscale,rinv10)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r20,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq20,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,FF),_fjsp_mul_v2r8(vftabscale,rinv20)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r30              = _fjsp_mul_v2r8(rsq30,rinv30);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _fjsp_mul_v2r8(iq3,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r30,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq30,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq30,FF),_fjsp_mul_v2r8(vftabscale,rinv30)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+            
+            fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+
+            gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 200 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx30             = _fjsp_sub_v2r8(ix3,jx0);
+            dy30             = _fjsp_sub_v2r8(iy3,jy0);
+            dz30             = _fjsp_sub_v2r8(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+
+            /* Load parameters for j particles */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            vfconv.i[0]       += 4;
+            vfconv.i[1]       += 4;
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            vvdw             = _fjsp_add_v2r8(vvdw12,vvdw6);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r10,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq10,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,FF),_fjsp_mul_v2r8(vftabscale,rinv10)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r20,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq20,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,FF),_fjsp_mul_v2r8(vftabscale,rinv20)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r30              = _fjsp_mul_v2r8(rsq30,rinv30);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _fjsp_mul_v2r8(iq3,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r30,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq30,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq30,FF),_fjsp_mul_v2r8(vftabscale,rinv30)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+            
+            fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+
+            gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 200 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_4atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 26 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_VF,outeriter*26 + inneriter*200);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: CubicSplineTable
+ * VdW interaction:            CubicSplineTable
+ * Geometry:                   Water4-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+    real             *vftab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    vftab            = kernel_data->table_elec_vdw->data;
+    vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_elec_vdw->scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_4rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+        fix3             = _fjsp_setzero_v2r8();
+        fiy3             = _fjsp_setzero_v2r8();
+        fiz3             = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx30             = _fjsp_sub_v2r8(ix3,jx0);
+            dy30             = _fjsp_sub_v2r8(iy3,jy0);
+            dz30             = _fjsp_sub_v2r8(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            vfconv.i[0]       += 4;
+            vfconv.i[1]       += 4;
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 2 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 4 );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 6 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            fscal            = fvdw;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r10,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,FF),_fjsp_mul_v2r8(vftabscale,rinv10)));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r20,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,FF),_fjsp_mul_v2r8(vftabscale,rinv20)));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r30              = _fjsp_mul_v2r8(rsq30,rinv30);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _fjsp_mul_v2r8(iq3,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r30,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq30,FF),_fjsp_mul_v2r8(vftabscale,rinv30)));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+            
+            fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+
+            gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 180 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx30             = _fjsp_sub_v2r8(ix3,jx0);
+            dy30             = _fjsp_sub_v2r8(iy3,jy0);
+            dz30             = _fjsp_sub_v2r8(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+
+            /* Load parameters for j particles */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            vfconv.i[0]       += 4;
+            vfconv.i[1]       += 4;
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            fscal            = fvdw;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r10,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,FF),_fjsp_mul_v2r8(vftabscale,rinv10)));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r20,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,FF),_fjsp_mul_v2r8(vftabscale,rinv20)));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r30              = _fjsp_mul_v2r8(rsq30,rinv30);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _fjsp_mul_v2r8(iq3,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r30,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq30,FF),_fjsp_mul_v2r8(vftabscale,rinv30)));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+            
+            fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+
+            gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 180 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_4atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 24 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_F,outeriter*24 + inneriter*180);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_sparc64_hpc_ace_double.c b/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..7ef236b
--- /dev/null
@@ -0,0 +1,2479 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: CubicSplineTable
+ * VdW interaction:            CubicSplineTable
+ * Geometry:                   Water4-Water4
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B;
+    _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B;
+    _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    int              vdwjidx3A,vdwjidx3B;
+    _fjsp_v2r8       jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    _fjsp_v2r8       dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+    _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    _fjsp_v2r8       dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+    _fjsp_v2r8       dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+    _fjsp_v2r8       dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+    _fjsp_v2r8       dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+    real             *vftab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    vftab            = kernel_data->table_elec_vdw->data;
+    vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_elec_vdw->scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+    jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+    jq3              = gmx_fjsp_set1_v2r8(charge[inr+3]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    c6_00            = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    qq11             = _fjsp_mul_v2r8(iq1,jq1);
+    qq12             = _fjsp_mul_v2r8(iq1,jq2);
+    qq13             = _fjsp_mul_v2r8(iq1,jq3);
+    qq21             = _fjsp_mul_v2r8(iq2,jq1);
+    qq22             = _fjsp_mul_v2r8(iq2,jq2);
+    qq23             = _fjsp_mul_v2r8(iq2,jq3);
+    qq31             = _fjsp_mul_v2r8(iq3,jq1);
+    qq32             = _fjsp_mul_v2r8(iq3,jq2);
+    qq33             = _fjsp_mul_v2r8(iq3,jq3);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_4rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+        fix3             = _fjsp_setzero_v2r8();
+        fiy3             = _fjsp_setzero_v2r8();
+        fiz3             = _fjsp_setzero_v2r8();
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+        vvdwsum          = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_4rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx13             = _fjsp_sub_v2r8(ix1,jx3);
+            dy13             = _fjsp_sub_v2r8(iy1,jy3);
+            dz13             = _fjsp_sub_v2r8(iz1,jz3);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+            dx23             = _fjsp_sub_v2r8(ix2,jx3);
+            dy23             = _fjsp_sub_v2r8(iy2,jy3);
+            dz23             = _fjsp_sub_v2r8(iz2,jz3);
+            dx31             = _fjsp_sub_v2r8(ix3,jx1);
+            dy31             = _fjsp_sub_v2r8(iy3,jy1);
+            dz31             = _fjsp_sub_v2r8(iz3,jz1);
+            dx32             = _fjsp_sub_v2r8(ix3,jx2);
+            dy32             = _fjsp_sub_v2r8(iy3,jy2);
+            dz32             = _fjsp_sub_v2r8(iz3,jz2);
+            dx33             = _fjsp_sub_v2r8(ix3,jx3);
+            dy33             = _fjsp_sub_v2r8(iy3,jy3);
+            dz33             = _fjsp_sub_v2r8(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+            rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+            rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+            rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+            rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+            rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+            rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+            rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+            rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+            fjx3             = _fjsp_setzero_v2r8();
+            fjy3             = _fjsp_setzero_v2r8();
+            fjz3             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            vfconv.i[0]       += 4;
+            vfconv.i[1]       += 4;
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 2 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 4 );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 6 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            vvdw             = _fjsp_add_v2r8(vvdw12,vvdw6);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r11,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq11,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,FF),_fjsp_mul_v2r8(vftabscale,rinv11)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r12,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq12,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,FF),_fjsp_mul_v2r8(vftabscale,rinv12)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r13              = _fjsp_mul_v2r8(rsq13,rinv13);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r13,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq13,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq13,FF),_fjsp_mul_v2r8(vftabscale,rinv13)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+            
+            fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r21,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq21,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,FF),_fjsp_mul_v2r8(vftabscale,rinv21)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r22,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq22,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,FF),_fjsp_mul_v2r8(vftabscale,rinv22)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r23              = _fjsp_mul_v2r8(rsq23,rinv23);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r23,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq23,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq23,FF),_fjsp_mul_v2r8(vftabscale,rinv23)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+            
+            fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r31              = _fjsp_mul_v2r8(rsq31,rinv31);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r31,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq31,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq31,FF),_fjsp_mul_v2r8(vftabscale,rinv31)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+            
+            fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r32              = _fjsp_mul_v2r8(rsq32,rinv32);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r32,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq32,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq32,FF),_fjsp_mul_v2r8(vftabscale,rinv32)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+            
+            fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r33              = _fjsp_mul_v2r8(rsq33,rinv33);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r33,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq33,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq33,FF),_fjsp_mul_v2r8(vftabscale,rinv33)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+            
+            fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+
+            gmx_fjsp_decrement_4rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 476 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_4rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx13             = _fjsp_sub_v2r8(ix1,jx3);
+            dy13             = _fjsp_sub_v2r8(iy1,jy3);
+            dz13             = _fjsp_sub_v2r8(iz1,jz3);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+            dx23             = _fjsp_sub_v2r8(ix2,jx3);
+            dy23             = _fjsp_sub_v2r8(iy2,jy3);
+            dz23             = _fjsp_sub_v2r8(iz2,jz3);
+            dx31             = _fjsp_sub_v2r8(ix3,jx1);
+            dy31             = _fjsp_sub_v2r8(iy3,jy1);
+            dz31             = _fjsp_sub_v2r8(iz3,jz1);
+            dx32             = _fjsp_sub_v2r8(ix3,jx2);
+            dy32             = _fjsp_sub_v2r8(iy3,jy2);
+            dz32             = _fjsp_sub_v2r8(iz3,jz2);
+            dx33             = _fjsp_sub_v2r8(ix3,jx3);
+            dy33             = _fjsp_sub_v2r8(iy3,jy3);
+            dz33             = _fjsp_sub_v2r8(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+            rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+            rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+            rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+            rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+            rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+            rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+            rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+            rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+            fjx3             = _fjsp_setzero_v2r8();
+            fjy3             = _fjsp_setzero_v2r8();
+            fjz3             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            vfconv.i[0]       += 4;
+            vfconv.i[1]       += 4;
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            vvdw             = _fjsp_add_v2r8(vvdw12,vvdw6);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r11,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq11,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,FF),_fjsp_mul_v2r8(vftabscale,rinv11)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r12,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq12,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,FF),_fjsp_mul_v2r8(vftabscale,rinv12)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r13              = _fjsp_mul_v2r8(rsq13,rinv13);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r13,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq13,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq13,FF),_fjsp_mul_v2r8(vftabscale,rinv13)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+            
+            fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r21,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq21,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,FF),_fjsp_mul_v2r8(vftabscale,rinv21)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r22,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq22,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,FF),_fjsp_mul_v2r8(vftabscale,rinv22)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r23              = _fjsp_mul_v2r8(rsq23,rinv23);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r23,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq23,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq23,FF),_fjsp_mul_v2r8(vftabscale,rinv23)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+            
+            fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r31              = _fjsp_mul_v2r8(rsq31,rinv31);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r31,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq31,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq31,FF),_fjsp_mul_v2r8(vftabscale,rinv31)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+            
+            fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r32              = _fjsp_mul_v2r8(rsq32,rinv32);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r32,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq32,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq32,FF),_fjsp_mul_v2r8(vftabscale,rinv32)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+            
+            fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r33              = _fjsp_mul_v2r8(rsq33,rinv33);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r33,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq33,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq33,FF),_fjsp_mul_v2r8(vftabscale,rinv33)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+            
+            fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+
+            gmx_fjsp_decrement_4rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 476 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_4atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 26 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*476);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: CubicSplineTable
+ * VdW interaction:            CubicSplineTable
+ * Geometry:                   Water4-Water4
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B;
+    _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B;
+    _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    int              vdwjidx3A,vdwjidx3B;
+    _fjsp_v2r8       jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    _fjsp_v2r8       dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+    _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    _fjsp_v2r8       dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+    _fjsp_v2r8       dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+    _fjsp_v2r8       dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+    _fjsp_v2r8       dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+    real             *vftab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    vftab            = kernel_data->table_elec_vdw->data;
+    vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_elec_vdw->scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+    jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+    jq3              = gmx_fjsp_set1_v2r8(charge[inr+3]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    c6_00            = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    qq11             = _fjsp_mul_v2r8(iq1,jq1);
+    qq12             = _fjsp_mul_v2r8(iq1,jq2);
+    qq13             = _fjsp_mul_v2r8(iq1,jq3);
+    qq21             = _fjsp_mul_v2r8(iq2,jq1);
+    qq22             = _fjsp_mul_v2r8(iq2,jq2);
+    qq23             = _fjsp_mul_v2r8(iq2,jq3);
+    qq31             = _fjsp_mul_v2r8(iq3,jq1);
+    qq32             = _fjsp_mul_v2r8(iq3,jq2);
+    qq33             = _fjsp_mul_v2r8(iq3,jq3);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_4rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+        fix3             = _fjsp_setzero_v2r8();
+        fiy3             = _fjsp_setzero_v2r8();
+        fiz3             = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_4rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx13             = _fjsp_sub_v2r8(ix1,jx3);
+            dy13             = _fjsp_sub_v2r8(iy1,jy3);
+            dz13             = _fjsp_sub_v2r8(iz1,jz3);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+            dx23             = _fjsp_sub_v2r8(ix2,jx3);
+            dy23             = _fjsp_sub_v2r8(iy2,jy3);
+            dz23             = _fjsp_sub_v2r8(iz2,jz3);
+            dx31             = _fjsp_sub_v2r8(ix3,jx1);
+            dy31             = _fjsp_sub_v2r8(iy3,jy1);
+            dz31             = _fjsp_sub_v2r8(iz3,jz1);
+            dx32             = _fjsp_sub_v2r8(ix3,jx2);
+            dy32             = _fjsp_sub_v2r8(iy3,jy2);
+            dz32             = _fjsp_sub_v2r8(iz3,jz2);
+            dx33             = _fjsp_sub_v2r8(ix3,jx3);
+            dy33             = _fjsp_sub_v2r8(iy3,jy3);
+            dz33             = _fjsp_sub_v2r8(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+            rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+            rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+            rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+            rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+            rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+            rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+            rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+            rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+            fjx3             = _fjsp_setzero_v2r8();
+            fjy3             = _fjsp_setzero_v2r8();
+            fjz3             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            vfconv.i[0]       += 4;
+            vfconv.i[1]       += 4;
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 2 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 4 );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 6 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            fscal            = fvdw;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r11,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,FF),_fjsp_mul_v2r8(vftabscale,rinv11)));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r12,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,FF),_fjsp_mul_v2r8(vftabscale,rinv12)));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r13              = _fjsp_mul_v2r8(rsq13,rinv13);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r13,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq13,FF),_fjsp_mul_v2r8(vftabscale,rinv13)));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+            
+            fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r21,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,FF),_fjsp_mul_v2r8(vftabscale,rinv21)));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r22,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,FF),_fjsp_mul_v2r8(vftabscale,rinv22)));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r23              = _fjsp_mul_v2r8(rsq23,rinv23);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r23,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq23,FF),_fjsp_mul_v2r8(vftabscale,rinv23)));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+            
+            fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r31              = _fjsp_mul_v2r8(rsq31,rinv31);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r31,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq31,FF),_fjsp_mul_v2r8(vftabscale,rinv31)));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+            
+            fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r32              = _fjsp_mul_v2r8(rsq32,rinv32);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r32,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq32,FF),_fjsp_mul_v2r8(vftabscale,rinv32)));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+            
+            fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r33              = _fjsp_mul_v2r8(rsq33,rinv33);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r33,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq33,FF),_fjsp_mul_v2r8(vftabscale,rinv33)));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+            
+            fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+
+            gmx_fjsp_decrement_4rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 432 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_4rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx13             = _fjsp_sub_v2r8(ix1,jx3);
+            dy13             = _fjsp_sub_v2r8(iy1,jy3);
+            dz13             = _fjsp_sub_v2r8(iz1,jz3);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+            dx23             = _fjsp_sub_v2r8(ix2,jx3);
+            dy23             = _fjsp_sub_v2r8(iy2,jy3);
+            dz23             = _fjsp_sub_v2r8(iz2,jz3);
+            dx31             = _fjsp_sub_v2r8(ix3,jx1);
+            dy31             = _fjsp_sub_v2r8(iy3,jy1);
+            dz31             = _fjsp_sub_v2r8(iz3,jz1);
+            dx32             = _fjsp_sub_v2r8(ix3,jx2);
+            dy32             = _fjsp_sub_v2r8(iy3,jy2);
+            dz32             = _fjsp_sub_v2r8(iz3,jz2);
+            dx33             = _fjsp_sub_v2r8(ix3,jx3);
+            dy33             = _fjsp_sub_v2r8(iy3,jy3);
+            dz33             = _fjsp_sub_v2r8(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+            rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+            rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+            rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+            rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+            rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+            rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+            rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+            rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+            fjx3             = _fjsp_setzero_v2r8();
+            fjy3             = _fjsp_setzero_v2r8();
+            fjz3             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            vfconv.i[0]       += 4;
+            vfconv.i[1]       += 4;
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            fscal            = fvdw;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r11,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,FF),_fjsp_mul_v2r8(vftabscale,rinv11)));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r12,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,FF),_fjsp_mul_v2r8(vftabscale,rinv12)));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r13              = _fjsp_mul_v2r8(rsq13,rinv13);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r13,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq13,FF),_fjsp_mul_v2r8(vftabscale,rinv13)));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+            
+            fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r21,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,FF),_fjsp_mul_v2r8(vftabscale,rinv21)));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r22,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,FF),_fjsp_mul_v2r8(vftabscale,rinv22)));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r23              = _fjsp_mul_v2r8(rsq23,rinv23);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r23,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq23,FF),_fjsp_mul_v2r8(vftabscale,rinv23)));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+            
+            fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r31              = _fjsp_mul_v2r8(rsq31,rinv31);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r31,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq31,FF),_fjsp_mul_v2r8(vftabscale,rinv31)));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+            
+            fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r32              = _fjsp_mul_v2r8(rsq32,rinv32);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r32,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq32,FF),_fjsp_mul_v2r8(vftabscale,rinv32)));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+            
+            fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r33              = _fjsp_mul_v2r8(rsq33,rinv33);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r33,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq33,FF),_fjsp_mul_v2r8(vftabscale,rinv33)));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+            
+            fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+
+            gmx_fjsp_decrement_4rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 432 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_4atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 24 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*432);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCSTab_VdwLJ_GeomP1P1_sparc64_hpc_ace_double.c b/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCSTab_VdwLJ_GeomP1P1_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..c78c355
--- /dev/null
@@ -0,0 +1,635 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwLJ_GeomP1P1_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: CubicSplineTable
+ * VdW interaction:            LennardJones
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        PotentialAndForce
+ *
+ * Walks the nlist->nri neighbor lists. For every i particle the j neighbors
+ * are processed two at a time (one per lane of a _fjsp_v2r8 register); if the
+ * neighbor count is odd, a final single-j pass handles the leftover particle.
+ * Each pair contributes tabulated (cubic spline) electrostatics plus a
+ * Lennard-Jones term, and both the potential sums and the forces are updated.
+ *
+ * Arguments, as used by this kernel:
+ *   nlist        reads nri, iinr, jindex, jjnr, shift and gid.
+ *   xx           coordinates, accessed as a flat real array through xx[0].
+ *   ff           forces, accessed as a flat real array through ff[0] and
+ *                updated through the decrement/update helpers.
+ *   fr           reads shift_vec, epsfac, ntype and nbfp; fshift is handed to
+ *                the i-force update helper (presumably accumulated there).
+ *   mdatoms      reads chargeA and typeA.
+ *   kernel_data  reads table_elec->data and table_elec->scale; energygrp_elec
+ *                and energygrp_vdw are passed, offset by gid[iidx], to
+ *                gmx_fjsp_update_1pot_v2r8.
+ *   nrnb         flop accounting, updated once at the end through inc_nrnb.
+ */
+void
+nb_kernel_ElecCSTab_VdwLJ_GeomP1P1_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     *
+     * NOTE(review): several locals declared below (e.g. rcutoff_scalar, tx, ty, tz, rcutoff,
+     * rcutoff2, jidxall, isai0, isaj0, fjx0, fjy0, fjz0, crf, krf, krf2, rvdw, fvdw6, fvdw12,
+     * sh_vdw_invrcut6, Heps, dummy_mask, cutoff_mask, one, two, gbconv, ewconv) are never
+     * referenced in this kernel; presumably the generator emits the same declarations for
+     * every kernel flavour.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+    real             *vftab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv; /* vfconv: lets the two table indices be read back as scalars */
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    vftab            = kernel_data->table_elec->data;
+    vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_elec->scale);
+
+    /* Pre-initialize the j indices and offsets so that compilers do not warn
+     * about possibly uninitialized use; the values are overwritten before use.
+     */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists (one i particle per list) */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list (offset into the flat shift arrays) */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors: j entries are jjnr[j_index_start..j_index_end-1] */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_1rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+
+        /* Load parameters for i particles: the charge is pre-scaled by facel (fr->epsfac),
+         * and vdwioffset0 selects this i type's row in the vdwparam table, which is
+         * indexed with two reals per type pair.
+         */
+        iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_load1_v2r8(charge+inr+0));
+        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+        vvdwsum          = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop: two j particles (A and B) per iteration.
+         * The loop stops while at least one full pair remains; a leftover
+         * single j particle is handled after the loop.
+         */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector (i minus j) */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer.
+             * vfeps is rt minus the integer part converted back to floating point; the
+             * integer indices are stored into vfconv so they can be used as scalar offsets.
+             */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4; /* table stride: the loads below read offsets +0 and +2 per point */
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS
+             * One load per j particle at the table offset, then a transpose, is done for
+             * the (Y,F) pair and again at offset +2 for the (G,H) pair. Presumably each
+             * register then holds one coefficient for both lanes
+             * (GMX_FJSP_TRANSPOSE2_V2R8 is defined elsewhere - confirm).
+             * Assuming _fjsp_madd_v2r8(a,b,c) evaluates a*b+c:
+             *   Fp = F + eps*(G + eps*H)
+             *   VV = Y + eps*Fp
+             *   FF = Fp + eps*(G + 2*eps*H)
+             * velec is qq*VV and felec is -qq*FF*tabscale*rinv.
+             */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq00,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,FF),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            /* LENNARD-JONES DISPERSION/REPULSION
+             * rinvsix is rinvsq00 cubed; vvdw6 = c6*rinvsix and vvdw12 = c12*rinvsix^2.
+             * Assuming _fjsp_msub_v2r8(a,b,c) evaluates a*b-c, the potential is
+             * vvdw12/12 - vvdw6/6, while the scalar force is (vvdw12 - vvdw6)*rinvsq00.
+             * NOTE(review): this presumably relies on the c6/c12 values in fr->nbfp
+             * being stored pre-multiplied by 6 and 12 - confirm against the table setup.
+             */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            /* Update vectorial force: accumulate fscal*d on the i particle; the helper
+             * below applies the matching update to the two j particles in f.
+             */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fscal,dx00,dy00,dz00);
+
+            /* Inner loop uses 59 flops */
+        }
+
+        if(jidx<j_index_end) /* odd neighbor count: exactly one j particle is left over */
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates (single pointer; only jnrA is valid in this pass) */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+
+            /* Load parameters for j particles; the charge load starts from a zeroed
+             * register (presumably leaving the unused lane at zero - confirm
+             * _fjsp_loadl_v2r8 semantics).
+             */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4; /* scaled like i[0], but not used for any table load in this pass */
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS
+             * Same evaluation as in the paired loop, except that only the table entry
+             * for vfconv.i[0] is loaded; the registers that would have held the second
+             * particle's data are set to zero before the transposes.
+             */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq00,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,FF),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            /* LENNARD-JONES DISPERSION/REPULSION (same expressions as in the paired loop) */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom.
+             * Each value is first combined with a zero register through
+             * _fjsp_unpacklo_v2r8, presumably so that only the lane holding the real
+             * j particle contributes to the sums - confirm the intrinsic's semantics.
+             */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+            vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force (fscal has been masked the same way as the energies) */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fscal,dx00,dy00,dz00);
+
+            /* Inner loop uses 59 flops */
+        }
+
+        /* End of innermost loop: hand the accumulated i force to the helper together
+         * with this particle's slots in f and fshift.
+         */
+
+        gmx_fjsp_update_iforce_1atom_swizzle_v2r8(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies for the energy group of this list */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations (counted per j particle, not per SIMD pass) */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 9 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops using the per-iteration counts noted above (9 and 59) */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_VF,outeriter*9 + inneriter*59);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwLJ_GeomP1P1_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: CubicSplineTable
+ * VdW interaction:            LennardJones
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecCSTab_VdwLJ_GeomP1P1_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+    real             *vftab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    vftab            = kernel_data->table_elec->data;
+    vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_elec->scale);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_1rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+
+        /* Load parameters for i particles */
+        iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_load1_v2r8(charge+inr+0));
+        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,FF),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,c6_00),_fjsp_mul_v2r8(rinvsix,rinvsq00));
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fscal,dx00,dy00,dz00);
+
+            /* Inner loop uses 50 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,FF),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,c6_00),_fjsp_mul_v2r8(rinvsix,rinvsq00));
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fscal,dx00,dy00,dz00);
+
+            /* Inner loop uses 50 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_1atom_swizzle_v2r8(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 7 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_F,outeriter*7 + inneriter*50);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCSTab_VdwLJ_GeomW3P1_sparc64_hpc_ace_double.c b/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCSTab_VdwLJ_GeomW3P1_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..602513b
--- /dev/null
@@ -0,0 +1,1097 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwLJ_GeomW3P1_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: CubicSplineTable
+ * VdW interaction:            LennardJones
+ * Geometry:                   Water3-Particle
+ * Calculate force/pot:        PotentialAndForce
+ *
+ * For every i entry of the neighbor list (three consecutive atoms, suffixes
+ * 0,1,2) this kernel loops over the listed j atoms and evaluates
+ *   - tabulated electrostatics between each of the three i atoms and the j atom,
+ *   - Lennard-Jones between i atom 0 and the j atom only.
+ * The i forces are handed to gmx_fjsp_update_iforce_3atom_swizzle_v2r8()
+ * together with the f and fshift destinations, the j forces are handed to the
+ * gmx_fjsp_decrement_1rvec_... helpers, and the summed potentials are handed to
+ * gmx_fjsp_update_1pot_v2r8() with the energy group slot selected by gid[iidx].
+ *
+ * Parameters (as used below):
+ *   nlist        neighbor list: nri, iinr, jindex, jjnr, shift and gid are read.
+ *   xx           coordinates, accessed through a flat real pointer (xx[0]).
+ *   ff           forces, accessed through a flat real pointer (ff[0]).
+ *   fr           supplies shift_vec, fshift, epsfac, ntype and nbfp.
+ *   mdatoms      supplies chargeA and typeA.
+ *   kernel_data  supplies table_elec (data, scale) and the energy group arrays.
+ *   nrnb         flop accounting, updated once through inc_nrnb() on exit.
+ *
+ * NOTE(review): the i-atom charges and the LJ type offset are read once from
+ * nlist->iinr[0] and reused for every i entry, so all i entries are presumably
+ * identical water molecules. iinr[0] is also read when nri is 0 - confirm that
+ * callers never pass an empty list.
+ */
+void
+nb_kernel_ElecCSTab_VdwLJ_GeomW3P1_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     *
+     * Two-digit suffixes (00, 10, 20) name an i-j atom pair: i atom index first, j atom index second.
+     * Several of the locals declared below (for example the rcutoff, krf and mask variables) are
+     * never referenced in this kernel.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+    real             *vftab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv; /* vfconv: table indices are stored as SIMD and read back as integers */
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    vftab            = kernel_data->table_elec->data;
+    vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_elec->scale);
+
+    /* Setup water-specific parameters
+     * The three i charges (already multiplied by facel) and the LJ parameter row
+     * offset of i atom 0 are taken from the first i entry only and are not
+     * reloaded inside the outer loop.
+     */
+    inr              = nlist->iinr[0];
+    iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+        vvdwsum          = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop
+         * Each trip handles two j entries (A and B); a single leftover entry is
+         * handled by the separate block that follows this loop.
+         */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00); /* only the 00 pair needs 1/r^2 (Lennard-Jones) */
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************
+             * Pair 00: i atom 0 with the j atom, tabulated electrostatics plus Lennard-Jones.
+             */
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer
+             * vfeps is what remains of rt after subtracting the integer index.
+             */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4; /* four table reals per table point */
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS
+             * Each table point is read as two 2-wide loads at offsets +0 and +2,
+             * once for lane A (i[0]) and once for lane B (i[1]). The transposes
+             * presumably regroup the data so that Y, F, G and H each hold one
+             * coefficient for both lanes - confirm against the macro definition.
+             * Assuming _fjsp_madd_v2r8(a,b,c) computes a*b+c:
+             *   Fp = F + eps*(G + eps*H)
+             *   VV = Y + eps*Fp
+             *   FF = Fp + eps*(G + 2*eps*H)
+             * and the scalar force is -(qq*FF)*(vftabscale*rinv).
+             */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq00,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,FF),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            /* LENNARD-JONES DISPERSION/REPULSION
+             * rinvsix = rinvsq^3, vvdw6 = c6*rinvsix, vvdw12 = c12*rinvsix^2.
+             * Assuming _fjsp_msub_v2r8(a,b,c) computes a*b-c:
+             *   vvdw = vvdw12/12 - vvdw6/6
+             *   fvdw = (vvdw12 - vvdw6)*rinvsq
+             * NOTE(review): the 1/6 and 1/12 factors suggest that the c6 and c12
+             * parameters are stored premultiplied by 6 and 12 - confirm against
+             * the code that fills fr->nbfp.
+             */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            /* Update vectorial force
+             * The same dx*fscal product is accumulated for the i atom and for the
+             * j atom; the j sum is handed to a "decrement" helper after the last pair.
+             */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************
+             * Pair 10: i atom 1 with the j atom, tabulated electrostatics only.
+             */
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r10,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS (same sequence as for pair 00) */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq10,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,FF),_fjsp_mul_v2r8(vftabscale,rinv10)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************
+             * Pair 20: i atom 2 with the j atom, tabulated electrostatics only.
+             */
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r20,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS (same sequence as for pair 00) */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq20,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,FF),_fjsp_mul_v2r8(vftabscale,rinv20)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 154 flops */
+        }
+
+        if(jidx<j_index_end) /* true only when this i entry has an odd number of j entries */
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates
+             * Only one j atom (A) remains, so the single-pointer helpers are used
+             * throughout this block.
+             */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+
+            /* Load parameters for j particles
+             * NOTE(review): _fjsp_loadl_v2r8 on a zeroed register presumably fills
+             * one lane with the charge and leaves the other at zero - confirm.
+             */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************
+             * Pair 00: i atom 0 with the j atom, tabulated electrostatics plus Lennard-Jones.
+             */
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS
+             * Only the i[0] table index is used for loads in this block; the second
+             * input of each transpose is a zero register instead of a table load.
+             */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq00,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,FF),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom.
+             * NOTE(review): the _fjsp_unpacklo_v2r8(v, zero) calls in this block
+             * presumably keep the lane of the real j atom and zero the lane that
+             * has no j atom, so it adds nothing to the sums - confirm.
+             */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+            vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************
+             * Pair 10: i atom 1 with the j atom, tabulated electrostatics only.
+             */
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r10,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq10,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,FF),_fjsp_mul_v2r8(vftabscale,rinv10)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************
+             * Pair 20: i atom 2 with the j atom, tabulated electrostatics only.
+             */
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r20,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq20,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,FF),_fjsp_mul_v2r8(vftabscale,rinv20)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 154 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies
+         * The per-entry sums go to the energy group slot selected by gid[iidx].
+         */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations
+         * This counts j entries, not trips of the two-wide loop above.
+         */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 20 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops
+     * Uses the per-iteration flop counts quoted in the loop comments above (20 outer, 154 inner).
+     */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_VF,outeriter*20 + inneriter*154);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwLJ_GeomW3P1_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: CubicSplineTable
+ * VdW interaction:            LennardJones
+ * Geometry:                   Water3-Particle
+ * Calculate force/pot:        Force
+ *
+ * Force-only kernel: for every i entry of the neighbor list, three i atoms
+ * (suffixes 0,1,2) interact with each listed j atom. All three pairs use the
+ * tabulated electrostatics; only the 0-0 pair adds Lennard-Jones. No potential
+ * energies are accumulated in this routine.
+ *
+ * Parameters:
+ *   nlist        neighbor list; nri, iinr, jindex, jjnr and shift are read
+ *                (gid is fetched but not used in this force-only kernel).
+ *   xx           coordinates, read through xx[0].
+ *   ff           forces, accessed through ff[0] and handed to the force
+ *                update/decrement helpers.
+ *   fr           supplies shift_vec, fshift, epsfac, ntype and nbfp.
+ *   mdatoms      supplies chargeA and typeA.
+ *   kernel_data  supplies table_elec->data and table_elec->scale.
+ *   nrnb         flop accounting, updated once at the end via inc_nrnb().
+ */
+void
+nb_kernel_ElecCSTab_VdwLJ_GeomW3P1_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+    real             *vftab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    vftab            = kernel_data->table_elec->data;
+    vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_elec->scale);
+
+    /* Setup water-specific parameters.
+     * The three i charges (pre-multiplied by facel) and the vdW type offset are
+     * read once, from the first i entry (iinr[0]), and reused for every i entry
+     * in the list; only atom 0 gets a vdW offset.
+     */
+    inr              = nlist->iinr[0];
+    iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* Initialize the j indices and offsets to avoid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop: two j atoms (A and B) are handled per iteration */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer.
+             * vfeps is the remainder rt - index; the index is multiplied by 4 below because
+             * four table values (Y,F,G,H) are loaded per index.
+             */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS
+             * Only the force is evaluated. Assuming _fjsp_madd_v2r8(a,b,c) = a*b+c (TODO confirm):
+             *   Fp    = F + eps*(G + eps*H)
+             *   FF    = Fp + eps*(G + 2*eps*H)
+             *   felec = -(qq*FF)*(tabscale*rinv)
+             */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,FF),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,c6_00),_fjsp_mul_v2r8(rinvsix,rinvsq00));
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r10,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,FF),_fjsp_mul_v2r8(vftabscale,rinv10)));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r20,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,FF),_fjsp_mul_v2r8(vftabscale,rinv20)));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 137 flops */
+        }
+
+        if(jidx<j_index_end) /* one unpaired j atom remains after the 2-wide loop */
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+
+            /* Load parameters for the single j particle; the charge is loaded into a zeroed register */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS
+             * Only the table entry for index i[0] is loaded here; the second
+             * transpose operand is set to zero instead of loading from index i[1].
+             */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,FF),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,c6_00),_fjsp_mul_v2r8(rinvsix,rinvsq00));
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8()); /* presumably clears the unused second lane - TODO confirm */
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r10,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,FF),_fjsp_mul_v2r8(vftabscale,rinv10)));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r20,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,FF),_fjsp_mul_v2r8(vftabscale,rinv20)));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 137 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 18 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops, using the per-iteration counts quoted above (18 outer, 137 inner) */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_F,outeriter*18 + inneriter*137);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_sparc64_hpc_ace_double.c b/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..18e4b9a
--- /dev/null
@@ -0,0 +1,2235 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: CubicSplineTable
+ * VdW interaction:            LennardJones
+ * Geometry:                   Water3-Water3
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B;
+    _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B;
+    _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+    _fjsp_v2r8       dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+    real             *vftab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    vftab            = kernel_data->table_elec->data;
+    vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_elec->scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq0              = gmx_fjsp_set1_v2r8(charge[inr+0]);
+    jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+    jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    qq00             = _fjsp_mul_v2r8(iq0,jq0);
+    c6_00            = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    qq01             = _fjsp_mul_v2r8(iq0,jq1);
+    qq02             = _fjsp_mul_v2r8(iq0,jq2);
+    qq10             = _fjsp_mul_v2r8(iq1,jq0);
+    qq11             = _fjsp_mul_v2r8(iq1,jq1);
+    qq12             = _fjsp_mul_v2r8(iq1,jq2);
+    qq20             = _fjsp_mul_v2r8(iq2,jq0);
+    qq21             = _fjsp_mul_v2r8(iq2,jq1);
+    qq22             = _fjsp_mul_v2r8(iq2,jq2);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+        vvdwsum          = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx01             = _fjsp_sub_v2r8(ix0,jx1);
+            dy01             = _fjsp_sub_v2r8(iy0,jy1);
+            dz01             = _fjsp_sub_v2r8(iz0,jz1);
+            dx02             = _fjsp_sub_v2r8(ix0,jx2);
+            dy02             = _fjsp_sub_v2r8(iy0,jy2);
+            dz02             = _fjsp_sub_v2r8(iz0,jz2);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+            rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+            rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq00,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,FF),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r01              = _fjsp_mul_v2r8(rsq01,rinv01);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r01,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq01,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq01,FF),_fjsp_mul_v2r8(vftabscale,rinv01)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+            
+            fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r02              = _fjsp_mul_v2r8(rsq02,rinv02);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r02,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq02,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq02,FF),_fjsp_mul_v2r8(vftabscale,rinv02)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+            
+            fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r10,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq10,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,FF),_fjsp_mul_v2r8(vftabscale,rinv10)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r11,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq11,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,FF),_fjsp_mul_v2r8(vftabscale,rinv11)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r12,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq12,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,FF),_fjsp_mul_v2r8(vftabscale,rinv12)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r20,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq20,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,FF),_fjsp_mul_v2r8(vftabscale,rinv20)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r21,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq21,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,FF),_fjsp_mul_v2r8(vftabscale,rinv21)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r22,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq22,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,FF),_fjsp_mul_v2r8(vftabscale,rinv22)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            gmx_fjsp_decrement_3rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 427 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx01             = _fjsp_sub_v2r8(ix0,jx1);
+            dy01             = _fjsp_sub_v2r8(iy0,jy1);
+            dz01             = _fjsp_sub_v2r8(iz0,jz1);
+            dx02             = _fjsp_sub_v2r8(ix0,jx2);
+            dy02             = _fjsp_sub_v2r8(iy0,jy2);
+            dz02             = _fjsp_sub_v2r8(iz0,jz2);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+            rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+            rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq00,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,FF),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+            vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r01              = _fjsp_mul_v2r8(rsq01,rinv01);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r01,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq01,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq01,FF),_fjsp_mul_v2r8(vftabscale,rinv01)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+            
+            fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r02              = _fjsp_mul_v2r8(rsq02,rinv02);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r02,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq02,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq02,FF),_fjsp_mul_v2r8(vftabscale,rinv02)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+            
+            fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r10,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq10,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,FF),_fjsp_mul_v2r8(vftabscale,rinv10)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r11,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq11,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,FF),_fjsp_mul_v2r8(vftabscale,rinv11)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r12,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq12,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,FF),_fjsp_mul_v2r8(vftabscale,rinv12)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r20,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq20,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,FF),_fjsp_mul_v2r8(vftabscale,rinv20)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r21,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq21,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,FF),_fjsp_mul_v2r8(vftabscale,rinv21)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r22,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq22,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,FF),_fjsp_mul_v2r8(vftabscale,rinv22)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            gmx_fjsp_decrement_3rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 427 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 20 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*427);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: CubicSplineTable
+ * VdW interaction:            LennardJones
+ * Geometry:                   Water3-Water3
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B;
+    _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B;
+    _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+    _fjsp_v2r8       dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+    real             *vftab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    vftab            = kernel_data->table_elec->data;
+    vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_elec->scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq0              = gmx_fjsp_set1_v2r8(charge[inr+0]);
+    jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+    jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    qq00             = _fjsp_mul_v2r8(iq0,jq0);
+    c6_00            = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    qq01             = _fjsp_mul_v2r8(iq0,jq1);
+    qq02             = _fjsp_mul_v2r8(iq0,jq2);
+    qq10             = _fjsp_mul_v2r8(iq1,jq0);
+    qq11             = _fjsp_mul_v2r8(iq1,jq1);
+    qq12             = _fjsp_mul_v2r8(iq1,jq2);
+    qq20             = _fjsp_mul_v2r8(iq2,jq0);
+    qq21             = _fjsp_mul_v2r8(iq2,jq1);
+    qq22             = _fjsp_mul_v2r8(iq2,jq2);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx01             = _fjsp_sub_v2r8(ix0,jx1);
+            dy01             = _fjsp_sub_v2r8(iy0,jy1);
+            dz01             = _fjsp_sub_v2r8(iz0,jz1);
+            dx02             = _fjsp_sub_v2r8(ix0,jx2);
+            dy02             = _fjsp_sub_v2r8(iy0,jy2);
+            dz02             = _fjsp_sub_v2r8(iz0,jz2);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+            rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+            rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,FF),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,c6_00),_fjsp_mul_v2r8(rinvsix,rinvsq00));
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r01              = _fjsp_mul_v2r8(rsq01,rinv01);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r01,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq01,FF),_fjsp_mul_v2r8(vftabscale,rinv01)));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+            
+            fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r02              = _fjsp_mul_v2r8(rsq02,rinv02);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r02,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq02,FF),_fjsp_mul_v2r8(vftabscale,rinv02)));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+            
+            fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r10,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,FF),_fjsp_mul_v2r8(vftabscale,rinv10)));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r11,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,FF),_fjsp_mul_v2r8(vftabscale,rinv11)));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r12,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,FF),_fjsp_mul_v2r8(vftabscale,rinv12)));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r20,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,FF),_fjsp_mul_v2r8(vftabscale,rinv20)));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r21,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,FF),_fjsp_mul_v2r8(vftabscale,rinv21)));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r22,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,FF),_fjsp_mul_v2r8(vftabscale,rinv22)));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            gmx_fjsp_decrement_3rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 386 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx01             = _fjsp_sub_v2r8(ix0,jx1);
+            dy01             = _fjsp_sub_v2r8(iy0,jy1);
+            dz01             = _fjsp_sub_v2r8(iz0,jz1);
+            dx02             = _fjsp_sub_v2r8(ix0,jx2);
+            dy02             = _fjsp_sub_v2r8(iy0,jy2);
+            dz02             = _fjsp_sub_v2r8(iz0,jz2);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+            rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+            rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,FF),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,c6_00),_fjsp_mul_v2r8(rinvsix,rinvsq00));
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r01              = _fjsp_mul_v2r8(rsq01,rinv01);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r01,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq01,FF),_fjsp_mul_v2r8(vftabscale,rinv01)));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+            
+            fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r02              = _fjsp_mul_v2r8(rsq02,rinv02);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r02,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq02,FF),_fjsp_mul_v2r8(vftabscale,rinv02)));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+            
+            fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r10,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,FF),_fjsp_mul_v2r8(vftabscale,rinv10)));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r11,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,FF),_fjsp_mul_v2r8(vftabscale,rinv11)));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r12,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,FF),_fjsp_mul_v2r8(vftabscale,rinv12)));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r20,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,FF),_fjsp_mul_v2r8(vftabscale,rinv20)));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r21,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,FF),_fjsp_mul_v2r8(vftabscale,rinv21)));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r22,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,FF),_fjsp_mul_v2r8(vftabscale,rinv22)));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            gmx_fjsp_decrement_3rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 386 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 18 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*386);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_sparc64_hpc_ace_double.c b/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..635418c
--- /dev/null
@@ -0,0 +1,1201 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: CubicSplineTable
+ * VdW interaction:            LennardJones
+ * Geometry:                   Water4-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+    real             *vftab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    vftab            = kernel_data->table_elec->data;
+    vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_elec->scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_4rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+        fix3             = _fjsp_setzero_v2r8();
+        fiy3             = _fjsp_setzero_v2r8();
+        fiz3             = _fjsp_setzero_v2r8();
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+        vvdwsum          = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx30             = _fjsp_sub_v2r8(ix3,jx0);
+            dy30             = _fjsp_sub_v2r8(iy3,jy0);
+            dz30             = _fjsp_sub_v2r8(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+
+            rinvsq00         = gmx_fjsp_inv_v2r8(rsq00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r10,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq10,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,FF),_fjsp_mul_v2r8(vftabscale,rinv10)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r20,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq20,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,FF),_fjsp_mul_v2r8(vftabscale,rinv20)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r30              = _fjsp_mul_v2r8(rsq30,rinv30);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _fjsp_mul_v2r8(iq3,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r30,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq30,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq30,FF),_fjsp_mul_v2r8(vftabscale,rinv30)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+            
+            fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+
+            gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 176 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx30             = _fjsp_sub_v2r8(ix3,jx0);
+            dy30             = _fjsp_sub_v2r8(iy3,jy0);
+            dz30             = _fjsp_sub_v2r8(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+
+            rinvsq00         = gmx_fjsp_inv_v2r8(rsq00);
+
+            /* Load parameters for j particles */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r10,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq10,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,FF),_fjsp_mul_v2r8(vftabscale,rinv10)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r20,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq20,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,FF),_fjsp_mul_v2r8(vftabscale,rinv20)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r30              = _fjsp_mul_v2r8(rsq30,rinv30);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _fjsp_mul_v2r8(iq3,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r30,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq30,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq30,FF),_fjsp_mul_v2r8(vftabscale,rinv30)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+            
+            fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+
+            gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 176 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_4atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 26 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_VF,outeriter*26 + inneriter*176);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: CubicSplineTable
+ * VdW interaction:            LennardJones
+ * Geometry:                   Water4-Particle
+ * Calculate force/pot:        Force
+ *
+ * Force-only kernel: for every neighborlist entry it loads four i sites and
+ * loops over the j particles two at a time, with a separate step for a
+ * leftover single j particle. Site 0 takes part in the Lennard-Jones
+ * interaction only, sites 1-3 in the tabulated electrostatics only. No
+ * potential energies are accumulated here.
+ *
+ * nlist        neighborlist; nri, iinr, jindex, jjnr, shift and gid are fetched
+ * xx           coordinates, addressed as a flat real array through xx[0]
+ * ff           forces, addressed as a flat real array through ff[0] and handed
+ *              to the force update helpers
+ * fr           supplies shift_vec, fshift, epsfac, ntype and nbfp
+ * mdatoms      supplies chargeA and typeA
+ * kernel_data  supplies table_elec (data and scale)
+ * nrnb         counters handed to inc_nrnb() together with the flop estimate
+ */
+void
+nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+    real             *vftab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    vftab            = kernel_data->table_elec->data;
+    vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_elec->scale);
+
+    /* Setup water-specific parameters.
+     * The charges of sites 1-3 (multiplied by facel) and the vdW parameter offset of
+     * site 0 are taken from the first i entry, iinr[0], and are not re-read inside the
+     * outer loop.
+     */
+    inr              = nlist->iinr[0];
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* Give the j indices and offsets a defined value up front so the compiler does
+     * not warn about possibly uninitialised variables. */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_4rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+        fix3             = _fjsp_setzero_v2r8();
+        fiy3             = _fjsp_setzero_v2r8();
+        fiz3             = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop: two j particles (A and B) per iteration; it stops
+         * while at least one j entry may still be left for the remainder step below. */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx30             = _fjsp_sub_v2r8(ix3,jx0);
+            dy30             = _fjsp_sub_v2r8(iy3,jy0);
+            dz30             = _fjsp_sub_v2r8(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+
+            rinvsq00         = gmx_fjsp_inv_v2r8(rsq00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION
+             * rinvsix is rinvsq00 cubed. Assuming _fjsp_msub_v2r8(a,b,c) computes a*b-c
+             * (TODO confirm), fvdw = (c12_00*rinvsix - c6_00)*rinvsix*rinvsq00.
+             */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,c6_00),_fjsp_mul_v2r8(rinvsix,rinvsq00));
+
+            fscal            = fvdw;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r10,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS
+             * The indices were multiplied by 4 above, i.e. a stride of four table values
+             * per point. The loads at +0 and +2 presumably fetch (Y,F) and (G,H) for one
+             * j particle each, and the transposes regroup them per quantity (TODO confirm
+             * against the helper definitions). Assuming _fjsp_madd_v2r8(a,b,c) computes
+             * a*b+c:
+             *   Fp = F + vfeps*(G + vfeps*H)
+             *   FF = Fp + vfeps*(G + 2*vfeps*H)
+             *   felec = -qq10*FF*vftabscale*rinv10
+             */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,FF),_fjsp_mul_v2r8(vftabscale,rinv10)));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r20,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,FF),_fjsp_mul_v2r8(vftabscale,rinv20)));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r30              = _fjsp_mul_v2r8(rsq30,rinv30);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _fjsp_mul_v2r8(iq3,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r30,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq30,FF),_fjsp_mul_v2r8(vftabscale,rinv30)));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+            
+            fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+
+            gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 159 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* Remainder step: reached only when one j particle is left over after the
+             * two-at-a-time loop, i.e. for an odd number of j entries.
+             * Load j atom coordinates. */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx30             = _fjsp_sub_v2r8(ix3,jx0);
+            dy30             = _fjsp_sub_v2r8(iy3,jy0);
+            dz30             = _fjsp_sub_v2r8(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+
+            rinvsq00         = gmx_fjsp_inv_v2r8(rsq00);
+
+            /* Load parameters for j particles */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,c6_00),_fjsp_mul_v2r8(rinvsix,rinvsq00));
+
+            fscal            = fvdw;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force. fscal was combined with a zero vector through
+             * _fjsp_unpacklo_v2r8 just above - presumably to clear the element that has
+             * no j particle in this remainder step (TODO confirm against the intrinsic). */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r10,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS
+             * Only the table entry at vfconv.i[0] is loaded here; the vectors that hold
+             * the second particle's entry in the paired loop are set to zero instead.
+             */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,FF),_fjsp_mul_v2r8(vftabscale,rinv10)));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r20,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,FF),_fjsp_mul_v2r8(vftabscale,rinv20)));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r30              = _fjsp_mul_v2r8(rsq30,rinv30);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _fjsp_mul_v2r8(iq3,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r30,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq30,FF),_fjsp_mul_v2r8(vftabscale,rinv30)));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+            
+            fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+
+            gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 159 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_4atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations: every j entry of this list counts once,
+         * including a single leftover entry handled by the remainder step. */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 24 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops: 24 per outer iteration plus 159 per inner iteration
+     * are reported to the nrnb counters. */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_F,outeriter*24 + inneriter*159);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCSTab_VdwLJ_GeomW4W4_sparc64_hpc_ace_double.c b/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCSTab_VdwLJ_GeomW4W4_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..4c49563
--- /dev/null
@@ -0,0 +1,2351 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwLJ_GeomW4W4_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: CubicSplineTable
+ * VdW interaction:            LennardJones
+ * Geometry:                   Water4-Water4
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecCSTab_VdwLJ_GeomW4W4_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B;
+    _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B;
+    _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    int              vdwjidx3A,vdwjidx3B;
+    _fjsp_v2r8       jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    _fjsp_v2r8       dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+    _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    _fjsp_v2r8       dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+    _fjsp_v2r8       dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+    _fjsp_v2r8       dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+    _fjsp_v2r8       dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+    real             *vftab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    vftab            = kernel_data->table_elec->data;
+    vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_elec->scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+    jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+    jq3              = gmx_fjsp_set1_v2r8(charge[inr+3]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    c6_00            = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    qq11             = _fjsp_mul_v2r8(iq1,jq1);
+    qq12             = _fjsp_mul_v2r8(iq1,jq2);
+    qq13             = _fjsp_mul_v2r8(iq1,jq3);
+    qq21             = _fjsp_mul_v2r8(iq2,jq1);
+    qq22             = _fjsp_mul_v2r8(iq2,jq2);
+    qq23             = _fjsp_mul_v2r8(iq2,jq3);
+    qq31             = _fjsp_mul_v2r8(iq3,jq1);
+    qq32             = _fjsp_mul_v2r8(iq3,jq2);
+    qq33             = _fjsp_mul_v2r8(iq3,jq3);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_4rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+        fix3             = _fjsp_setzero_v2r8();
+        fiy3             = _fjsp_setzero_v2r8();
+        fiz3             = _fjsp_setzero_v2r8();
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+        vvdwsum          = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_4rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx13             = _fjsp_sub_v2r8(ix1,jx3);
+            dy13             = _fjsp_sub_v2r8(iy1,jy3);
+            dz13             = _fjsp_sub_v2r8(iz1,jz3);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+            dx23             = _fjsp_sub_v2r8(ix2,jx3);
+            dy23             = _fjsp_sub_v2r8(iy2,jy3);
+            dz23             = _fjsp_sub_v2r8(iz2,jz3);
+            dx31             = _fjsp_sub_v2r8(ix3,jx1);
+            dy31             = _fjsp_sub_v2r8(iy3,jy1);
+            dz31             = _fjsp_sub_v2r8(iz3,jz1);
+            dx32             = _fjsp_sub_v2r8(ix3,jx2);
+            dy32             = _fjsp_sub_v2r8(iy3,jy2);
+            dz32             = _fjsp_sub_v2r8(iz3,jz2);
+            dx33             = _fjsp_sub_v2r8(ix3,jx3);
+            dy33             = _fjsp_sub_v2r8(iy3,jy3);
+            dz33             = _fjsp_sub_v2r8(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+            rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+            rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+            rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+            rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+            rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+            rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+            rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+            rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+
+            rinvsq00         = gmx_fjsp_inv_v2r8(rsq00);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+            fjx3             = _fjsp_setzero_v2r8();
+            fjy3             = _fjsp_setzero_v2r8();
+            fjz3             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r11,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq11,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,FF),_fjsp_mul_v2r8(vftabscale,rinv11)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r12,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq12,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,FF),_fjsp_mul_v2r8(vftabscale,rinv12)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r13              = _fjsp_mul_v2r8(rsq13,rinv13);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r13,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq13,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq13,FF),_fjsp_mul_v2r8(vftabscale,rinv13)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+            
+            fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r21,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq21,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,FF),_fjsp_mul_v2r8(vftabscale,rinv21)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r22,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq22,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,FF),_fjsp_mul_v2r8(vftabscale,rinv22)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r23              = _fjsp_mul_v2r8(rsq23,rinv23);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r23,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq23,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq23,FF),_fjsp_mul_v2r8(vftabscale,rinv23)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+            
+            fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r31              = _fjsp_mul_v2r8(rsq31,rinv31);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r31,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq31,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq31,FF),_fjsp_mul_v2r8(vftabscale,rinv31)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+            
+            fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r32              = _fjsp_mul_v2r8(rsq32,rinv32);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r32,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq32,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq32,FF),_fjsp_mul_v2r8(vftabscale,rinv32)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+            
+            fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r33              = _fjsp_mul_v2r8(rsq33,rinv33);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r33,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq33,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq33,FF),_fjsp_mul_v2r8(vftabscale,rinv33)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+            
+            fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+
+            gmx_fjsp_decrement_4rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 452 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_4rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx13             = _fjsp_sub_v2r8(ix1,jx3);
+            dy13             = _fjsp_sub_v2r8(iy1,jy3);
+            dz13             = _fjsp_sub_v2r8(iz1,jz3);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+            dx23             = _fjsp_sub_v2r8(ix2,jx3);
+            dy23             = _fjsp_sub_v2r8(iy2,jy3);
+            dz23             = _fjsp_sub_v2r8(iz2,jz3);
+            dx31             = _fjsp_sub_v2r8(ix3,jx1);
+            dy31             = _fjsp_sub_v2r8(iy3,jy1);
+            dz31             = _fjsp_sub_v2r8(iz3,jz1);
+            dx32             = _fjsp_sub_v2r8(ix3,jx2);
+            dy32             = _fjsp_sub_v2r8(iy3,jy2);
+            dz32             = _fjsp_sub_v2r8(iz3,jz2);
+            dx33             = _fjsp_sub_v2r8(ix3,jx3);
+            dy33             = _fjsp_sub_v2r8(iy3,jy3);
+            dz33             = _fjsp_sub_v2r8(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+            rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+            rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+            rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+            rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+            rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+            rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+            rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+            rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+
+            rinvsq00         = gmx_fjsp_inv_v2r8(rsq00);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+            fjx3             = _fjsp_setzero_v2r8();
+            fjy3             = _fjsp_setzero_v2r8();
+            fjz3             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r11,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq11,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,FF),_fjsp_mul_v2r8(vftabscale,rinv11)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r12,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq12,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,FF),_fjsp_mul_v2r8(vftabscale,rinv12)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r13              = _fjsp_mul_v2r8(rsq13,rinv13);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r13,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq13,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq13,FF),_fjsp_mul_v2r8(vftabscale,rinv13)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+            
+            fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r21,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq21,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,FF),_fjsp_mul_v2r8(vftabscale,rinv21)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r22,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq22,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,FF),_fjsp_mul_v2r8(vftabscale,rinv22)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r23              = _fjsp_mul_v2r8(rsq23,rinv23);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r23,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq23,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq23,FF),_fjsp_mul_v2r8(vftabscale,rinv23)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+            
+            fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r31              = _fjsp_mul_v2r8(rsq31,rinv31);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r31,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq31,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq31,FF),_fjsp_mul_v2r8(vftabscale,rinv31)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+            
+            fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r32              = _fjsp_mul_v2r8(rsq32,rinv32);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r32,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq32,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq32,FF),_fjsp_mul_v2r8(vftabscale,rinv32)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+            
+            fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r33              = _fjsp_mul_v2r8(rsq33,rinv33);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r33,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq33,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq33,FF),_fjsp_mul_v2r8(vftabscale,rinv33)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+            
+            fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+
+            gmx_fjsp_decrement_4rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 452 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_4atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 26 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*452);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwLJ_GeomW4W4_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: CubicSplineTable
+ * VdW interaction:            LennardJones
+ * Geometry:                   Water4-Water4
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecCSTab_VdwLJ_GeomW4W4_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B;
+    _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B;
+    _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    int              vdwjidx3A,vdwjidx3B;
+    _fjsp_v2r8       jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    _fjsp_v2r8       dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+    _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    _fjsp_v2r8       dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+    _fjsp_v2r8       dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+    _fjsp_v2r8       dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+    _fjsp_v2r8       dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+    real             *vftab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    vftab            = kernel_data->table_elec->data;
+    vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_elec->scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+    jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+    jq3              = gmx_fjsp_set1_v2r8(charge[inr+3]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    c6_00            = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    qq11             = _fjsp_mul_v2r8(iq1,jq1);
+    qq12             = _fjsp_mul_v2r8(iq1,jq2);
+    qq13             = _fjsp_mul_v2r8(iq1,jq3);
+    qq21             = _fjsp_mul_v2r8(iq2,jq1);
+    qq22             = _fjsp_mul_v2r8(iq2,jq2);
+    qq23             = _fjsp_mul_v2r8(iq2,jq3);
+    qq31             = _fjsp_mul_v2r8(iq3,jq1);
+    qq32             = _fjsp_mul_v2r8(iq3,jq2);
+    qq33             = _fjsp_mul_v2r8(iq3,jq3);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_4rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+        fix3             = _fjsp_setzero_v2r8();
+        fiy3             = _fjsp_setzero_v2r8();
+        fiz3             = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_4rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx13             = _fjsp_sub_v2r8(ix1,jx3);
+            dy13             = _fjsp_sub_v2r8(iy1,jy3);
+            dz13             = _fjsp_sub_v2r8(iz1,jz3);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+            dx23             = _fjsp_sub_v2r8(ix2,jx3);
+            dy23             = _fjsp_sub_v2r8(iy2,jy3);
+            dz23             = _fjsp_sub_v2r8(iz2,jz3);
+            dx31             = _fjsp_sub_v2r8(ix3,jx1);
+            dy31             = _fjsp_sub_v2r8(iy3,jy1);
+            dz31             = _fjsp_sub_v2r8(iz3,jz1);
+            dx32             = _fjsp_sub_v2r8(ix3,jx2);
+            dy32             = _fjsp_sub_v2r8(iy3,jy2);
+            dz32             = _fjsp_sub_v2r8(iz3,jz2);
+            dx33             = _fjsp_sub_v2r8(ix3,jx3);
+            dy33             = _fjsp_sub_v2r8(iy3,jy3);
+            dz33             = _fjsp_sub_v2r8(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+            rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+            rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+            rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+            rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+            rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+            rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+            rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+            rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+
+            rinvsq00         = gmx_fjsp_inv_v2r8(rsq00);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+            fjx3             = _fjsp_setzero_v2r8();
+            fjy3             = _fjsp_setzero_v2r8();
+            fjz3             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,c6_00),_fjsp_mul_v2r8(rinvsix,rinvsq00));
+
+            fscal            = fvdw;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r11,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,FF),_fjsp_mul_v2r8(vftabscale,rinv11)));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r12,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,FF),_fjsp_mul_v2r8(vftabscale,rinv12)));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r13              = _fjsp_mul_v2r8(rsq13,rinv13);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r13,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq13,FF),_fjsp_mul_v2r8(vftabscale,rinv13)));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+            
+            fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r21,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,FF),_fjsp_mul_v2r8(vftabscale,rinv21)));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r22,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,FF),_fjsp_mul_v2r8(vftabscale,rinv22)));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r23              = _fjsp_mul_v2r8(rsq23,rinv23);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r23,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq23,FF),_fjsp_mul_v2r8(vftabscale,rinv23)));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+            
+            fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r31              = _fjsp_mul_v2r8(rsq31,rinv31);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r31,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq31,FF),_fjsp_mul_v2r8(vftabscale,rinv31)));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+            
+            fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r32              = _fjsp_mul_v2r8(rsq32,rinv32);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r32,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq32,FF),_fjsp_mul_v2r8(vftabscale,rinv32)));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+            
+            fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r33              = _fjsp_mul_v2r8(rsq33,rinv33);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r33,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq33,FF),_fjsp_mul_v2r8(vftabscale,rinv33)));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+            
+            fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+
+            gmx_fjsp_decrement_4rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 411 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_4rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx13             = _fjsp_sub_v2r8(ix1,jx3);
+            dy13             = _fjsp_sub_v2r8(iy1,jy3);
+            dz13             = _fjsp_sub_v2r8(iz1,jz3);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+            dx23             = _fjsp_sub_v2r8(ix2,jx3);
+            dy23             = _fjsp_sub_v2r8(iy2,jy3);
+            dz23             = _fjsp_sub_v2r8(iz2,jz3);
+            dx31             = _fjsp_sub_v2r8(ix3,jx1);
+            dy31             = _fjsp_sub_v2r8(iy3,jy1);
+            dz31             = _fjsp_sub_v2r8(iz3,jz1);
+            dx32             = _fjsp_sub_v2r8(ix3,jx2);
+            dy32             = _fjsp_sub_v2r8(iy3,jy2);
+            dz32             = _fjsp_sub_v2r8(iz3,jz2);
+            dx33             = _fjsp_sub_v2r8(ix3,jx3);
+            dy33             = _fjsp_sub_v2r8(iy3,jy3);
+            dz33             = _fjsp_sub_v2r8(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+            rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+            rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+            rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+            rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+            rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+            rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+            rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+            rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+
+            rinvsq00         = gmx_fjsp_inv_v2r8(rsq00);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+            fjx3             = _fjsp_setzero_v2r8();
+            fjy3             = _fjsp_setzero_v2r8();
+            fjz3             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,c6_00),_fjsp_mul_v2r8(rinvsix,rinvsq00));
+
+            fscal            = fvdw;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r11,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,FF),_fjsp_mul_v2r8(vftabscale,rinv11)));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r12,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,FF),_fjsp_mul_v2r8(vftabscale,rinv12)));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r13              = _fjsp_mul_v2r8(rsq13,rinv13);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r13,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq13,FF),_fjsp_mul_v2r8(vftabscale,rinv13)));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+            
+            fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r21,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,FF),_fjsp_mul_v2r8(vftabscale,rinv21)));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r22,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,FF),_fjsp_mul_v2r8(vftabscale,rinv22)));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r23              = _fjsp_mul_v2r8(rsq23,rinv23);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r23,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq23,FF),_fjsp_mul_v2r8(vftabscale,rinv23)));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+            
+            fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r31              = _fjsp_mul_v2r8(rsq31,rinv31);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r31,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq31,FF),_fjsp_mul_v2r8(vftabscale,rinv31)));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+            
+            fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r32              = _fjsp_mul_v2r8(rsq32,rinv32);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r32,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq32,FF),_fjsp_mul_v2r8(vftabscale,rinv32)));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+            
+            fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r33              = _fjsp_mul_v2r8(rsq33,rinv33);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r33,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq33,FF),_fjsp_mul_v2r8(vftabscale,rinv33)));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+            
+            fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+
+            gmx_fjsp_decrement_4rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 411 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_4atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 24 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*411);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCSTab_VdwNone_GeomP1P1_sparc64_hpc_ace_double.c b/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCSTab_VdwNone_GeomP1P1_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..d12f4a4
--- /dev/null
@@ -0,0 +1,564 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwNone_GeomP1P1_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: CubicSplineTable
+ * VdW interaction:            None
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        PotentialAndForce
+ *
+ * For each of the nlist->nri entries, the kernel loads one i atom (coordinates plus
+ * shift vector, charge scaled by fr->epsfac) and walks its j neighbors
+ * jjnr[jindex[iidx] .. jindex[iidx+1]-1] two at a time. For every pair it evaluates
+ * the tabulated electrostatic potential and scalar force. An odd leftover j atom is
+ * handled by a separate single-atom path after the pair loop.
+ *
+ * Parameters (as used by this function):
+ *   nlist        neighborlist; nri, iinr, jindex, jjnr, shift and gid are read.
+ *   xx           coordinates, accessed through xx[0] as a flat array with DIM reals per atom.
+ *   ff           forces, accessed through ff[0] with the same indexing; it is only touched
+ *                through the gmx_fjsp_decrement_fma_* and gmx_fjsp_update_iforce_* helpers.
+ *   fr           supplies shift_vec, fshift and epsfac.
+ *   mdatoms      supplies chargeA, indexed by atom number.
+ *   kernel_data  supplies table_elec (data and scale) and energygrp_elec, the latter
+ *                indexed by gid[iidx] and passed to gmx_fjsp_update_1pot_v2r8.
+ *   nrnb         flop counter; inc_nrnb() is called once at the end with
+ *                outeriter*8 + inneriter*46.
+ */
+void
+nb_kernel_ElecCSTab_VdwNone_GeomP1P1_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     *
+     * NOTE(review): the declaration list below appears to be shared generator boilerplate;
+     * several locals (e.g. rcutoff_scalar, tx/ty/tz, dummy_mask, cutoff_mask, gbconv, ewconv)
+     * are declared but never referenced in this kernel.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+    real             *vftab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+
+    vftab            = kernel_data->table_elec->data;
+    vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_elec->scale);
+
+    /* Give the j indices and offsets defined values up front to avoid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_1rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+
+        /* Load parameters for i particles: the i charge is pre-multiplied by facel (fr->epsfac) */
+        iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_load1_v2r8(charge+inr+0));
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop: j neighbors are consumed in pairs (A,B). The loop stops
+         * while at most one neighbor is left; that leftover is handled after the loop.
+         */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer.
+             * vfeps = rt - _fjsp_xtod_v2r8(itab_tmp). The indices are written out through the
+             * vfconv union and multiplied by 4 before being used as offsets into vftab.
+             */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS
+             * Two loads per j atom: offset +0 feeds Y/F and offset +2 feeds G/H, using
+             * vfconv.i[0] for atom A and vfconv.i[1] for atom B. GMX_FJSP_TRANSPOSE2_V2R8 is
+             * defined elsewhere; presumably it regroups the A/B loads per coefficient (verify).
+             * Assuming _fjsp_madd_v2r8(a,b,c) computes a*b+c:
+             *   Fp = F + vfeps*(G + vfeps*H),  VV = Y + vfeps*Fp,
+             *   FF = Fp + vfeps*(G + twovfeps*H),
+             *   velec = qq00*VV,  felec = -(qq00*FF)*(vftabscale*rinv00).
+             */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq00,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,FF),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force: the i-atom accumulators gain displacement*fscal */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fscal,dx00,dy00,dz00);
+
+            /* Inner loop uses 46 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* Remainder path: exactly one j atom is left, so only the A slot is loaded
+             * (1ptr helper variants are used throughout this branch).
+             */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            /* Load parameters for j particles: a single charge is loaded into a zeroed vector */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer.
+             * Both vfconv entries are scaled, but only vfconv.i[0] is used for loads below.
+             */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS
+             * Same arithmetic as in the pair loop, except that the second operand of each
+             * transpose is a zero vector instead of a table load for atom B.
+             */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq00,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,FF),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom.
+             * velec (and fscal below) are first combined with a zero vector through
+             * _fjsp_unpacklo_v2r8; presumably this clears the unused second slot - verify.
+             */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fscal,dx00,dy00,dz00);
+
+            /* Inner loop uses 46 flops */
+        }
+
+        /* End of innermost loop: hand the accumulated i-atom force to the helper together
+         * with the force and shift-force destinations for this list entry.
+         */
+
+        gmx_fjsp_update_iforce_1atom_swizzle_v2r8(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+
+        /* Increment number of inner iterations (counts j atoms, not j pairs) */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 8 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VF,outeriter*8 + inneriter*46);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwNone_GeomP1P1_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: CubicSplineTable
+ * VdW interaction:            None
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        Force
+ *
+ * Force-only kernel: for every i entry of the neighbor list it walks the j
+ * entries two at a time, derives the scalar electrostatic force from the
+ * cubic spline table, accumulates it for the i atom and passes it to the
+ * helper that updates the j atom forces. No potential is accumulated here.
+ *
+ * nlist       - neighbor list; nri, iinr, jindex, jjnr, shift and gid are read
+ * xx          - coordinates, accessed as a flat real array starting at xx[0]
+ * ff          - forces, accessed as a flat real array starting at ff[0]
+ * fr          - supplies shift_vec, fshift and epsfac
+ * mdatoms     - supplies chargeA
+ * kernel_data - supplies table_elec (data and scale)
+ * nrnb        - flop accounting, updated through inc_nrnb() before returning
+ */
+void
+nb_kernel_ElecCSTab_VdwNone_GeomP1P1_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     *
+     * Several locals declared below (for example rcutoff_scalar, velecsum,
+     * gbconv and ewconv) are never referenced in this force-only kernel.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+    real             *vftab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv; /* vfconv: SIMD table indices are stored here and read back as scalars */
+
+    x                = xx[0]; /* flat view of the coordinates */
+    f                = ff[0]; /* flat view of the forces */
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid; /* assigned but not read again in this kernel */
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+
+    vftab            = kernel_data->table_elec->data;
+    vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_elec->scale);
+
+    /* Pre-initialize the j indices and offsets so that compilers do not
+     * warn about possibly uninitialized use.
+     */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors: j entries jindex[iidx] .. jindex[iidx+1]-1 */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_1rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
+        /* Reset the i force accumulators for this list */
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+
+        /* Load parameters for i particles: charge of atom inr scaled by facel */
+        iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_load1_v2r8(charge+inr+0));
+
+        /* Start inner kernel loop: j entries are handled in pairs (A,B);
+         * a single leftover entry is handled by the block after the loop.
+         */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00); /* r = rsq * (1/r) */
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer.
+             * vfeps is the remainder rt - index. The index is multiplied by 4 below
+             * because the loads that follow read four reals per table point
+             * (two at offset +0 and two at offset +2).
+             */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS
+             * Table data is loaded separately for index i[0] and index i[1] and then
+             * passed through GMX_FJSP_TRANSPOSE2_V2R8 (presumably so that Y,F,G,H each
+             * hold one coefficient for both j atoms - confirm against the macro).
+             * Y is not read again after the transpose in this force-only kernel.
+             */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F); /* F + eps*(G + eps*H), assuming madd(a,b,c) = a*b+c */
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp); /* Fp + eps*(G + 2*eps*H), same assumption */
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,FF),_fjsp_mul_v2r8(vftabscale,rinv00))); /* -(qq*FF)*(tabscale/r) */
+
+            fscal            = felec;
+
+            /* Update vectorial force: accumulate fscal times the displacement on the i atom */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fscal,dx00,dy00,dz00);
+
+            /* Inner loop uses 42 flops */
+        }
+
+        if(jidx<j_index_end) /* true when the number of j entries is odd: one entry is left over */
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates (single j atom, via the 1ptr helper) */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            /* Load parameters for j particles: only the charge of atom A is loaded,
+             * the other operand of the load is a zero register.
+             */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS
+             * Only index i[0] is used for table loads here; the registers that held
+             * the second atom's data in the paired loop are set to zero instead.
+             */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,FF),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8()); /* presumably keeps the atom A value and zeroes the other one so the unused slot adds no force - confirm */
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fscal,dx00,dy00,dz00);
+
+            /* Inner loop uses 42 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_1atom_swizzle_v2r8(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset); /* hands the accumulated i force to the helper together with the force and shift-force destinations */
+
+        /* Increment number of inner iterations (counted per j entry, not per pair) */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 7 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops, using the per-iteration counts noted in the loops above */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_F,outeriter*7 + inneriter*42);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCSTab_VdwNone_GeomW3P1_sparc64_hpc_ace_double.c b/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCSTab_VdwNone_GeomW3P1_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..6f93895
--- /dev/null
@@ -0,0 +1,1026 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwNone_GeomW3P1_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: CubicSplineTable
+ * VdW interaction:            None
+ * Geometry:                   Water3-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecCSTab_VdwNone_GeomW3P1_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+    real             *vftab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+
+    vftab            = kernel_data->table_elec->data;
+    vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_elec->scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq00,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,FF),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r10,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq10,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,FF),_fjsp_mul_v2r8(vftabscale,rinv10)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r20,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq20,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,FF),_fjsp_mul_v2r8(vftabscale,rinv20)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 141 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+
+            /* Load parameters for j particles */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq00,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,FF),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r10,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq10,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,FF),_fjsp_mul_v2r8(vftabscale,rinv10)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r20,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq20,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,FF),_fjsp_mul_v2r8(vftabscale,rinv20)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 141 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 19 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3_VF,outeriter*19 + inneriter*141);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwNone_GeomW3P1_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: CubicSplineTable
+ * VdW interaction:            None
+ * Geometry:                   Water3-Particle
+ * Calculate force/pot:        Force
+ *
+ * For each of the nlist->nri list entries, the three i atoms (suffixes 0,1,2)
+ * are paired with every j particle in jjnr[jindex[iidx]] .. jjnr[jindex[iidx+1]-1].
+ * Only electrostatic forces are evaluated, using the cubic spline table in
+ * kernel_data->table_elec; no potential energy is accumulated in this kernel.
+ *
+ * nlist        neighbor list; nri, iinr, jindex, jjnr, shift and gid are read
+ * xx           coordinates, accessed as a flat real array through xx[0]
+ * ff           forces, accessed as a flat real array through ff[0] and handed
+ *              to the force decrement/update helper routines
+ * fr           supplies shift_vec, fshift and epsfac
+ * mdatoms      supplies the per-atom charges (chargeA)
+ * kernel_data  supplies the electrostatics table data and its scale
+ * nrnb         flop accounting, updated through inc_nrnb()
+ */
+void
+nb_kernel_ElecCSTab_VdwNone_GeomW3P1_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     *
+     * The declarations below appear to follow a common generated layout; several of
+     * them (for instance the c6/c12, cutoff, mask and potential variables) are not
+     * referenced anywhere in this force-only kernel.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+    real             *vftab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+
+    vftab            = kernel_data->table_elec->data;
+    vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_elec->scale);
+
+    /* Setup water-specific parameters.
+     * The charges are taken from the first i entry only (iinr[0]) and reused for
+     * every list entry - presumably all i waters in this list carry identical
+     * charges; verify against the neighbor list construction.
+     * NOTE(review): iinr[0] is read even when nri is 0 - confirm this is safe.
+     */
+    inr              = nlist->iinr[0];
+    iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+
+    /* Give these a defined initial value to avoid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop: two j particles (A and B) are handled per
+         * iteration; a leftover single j particle is treated after the loop.
+         */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i atom 0 and the j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer.
+             * vfeps is rt minus the converted-back index, i.e. the offset within the table interval.
+             */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS
+             * The index was multiplied by 4 above and the table is read with loads at
+             * offsets +0 and +2, so each table point appears to occupy four consecutive
+             * reals (named Y,F,G,H here). Each load reads the values for one j particle;
+             * the transpose presumably regroups them so that every register holds one
+             * quantity for both j particles - confirm against GMX_FJSP_TRANSPOSE2_V2R8.
+             */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,FF),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            fscal            = felec;
+
+            /* Update vectorial force: the same dx/dy/dz and fscal operands are fed into
+             * the i atom accumulators and the j particle accumulators.
+             */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i atom 1 and the j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r10,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS (same sequence as for i atom 0) */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,FF),_fjsp_mul_v2r8(vftabscale,rinv10)));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i atom 2 and the j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r20,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS (same sequence as for i atom 0) */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,FF),_fjsp_mul_v2r8(vftabscale,rinv20)));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 129 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* Remainder pass for a single leftover j particle (A only).
+             * load j atom coordinates
+             */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+
+            /* Load parameters for j particles: only the charge of j particle A is
+             * loaded, combined with a zero vector.
+             */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i atom 0 and the j atom */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS
+             * Only the table entry addressed by vfconv.i[0] is loaded here; F and H
+             * start as zero vectors before the transpose, and vfconv.i[1] is not used.
+             */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,FF),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force. fscal was combined with a zero vector just above,
+             * presumably to clear the element that has no j particle - TODO confirm
+             * against the _fjsp_unpacklo_v2r8 definition.
+             */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i atom 1 and the j atom */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r10,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS (single j particle, as for i atom 0) */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,FF),_fjsp_mul_v2r8(vftabscale,rinv10)));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i atom 2 and the j atom */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r20,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS (single j particle, as for i atom 0) */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,FF),_fjsp_mul_v2r8(vftabscale,rinv20)));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 129 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 18 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops, using the per-iteration counts quoted above (18 outer, 129 inner) */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3_F,outeriter*18 + inneriter*129);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCSTab_VdwNone_GeomW3W3_sparc64_hpc_ace_double.c b/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCSTab_VdwNone_GeomW3W3_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..d5e0083
--- /dev/null
@@ -0,0 +1,2170 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwNone_GeomW3W3_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: CubicSplineTable
+ * VdW interaction:            None
+ * Geometry:                   Water3-Water3
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecCSTab_VdwNone_GeomW3W3_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B;
+    _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B;
+    _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+    _fjsp_v2r8       dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+    real             *vftab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+
+    vftab            = kernel_data->table_elec->data;
+    vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_elec->scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+
+    jq0              = gmx_fjsp_set1_v2r8(charge[inr+0]);
+    jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+    jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+    qq00             = _fjsp_mul_v2r8(iq0,jq0);
+    qq01             = _fjsp_mul_v2r8(iq0,jq1);
+    qq02             = _fjsp_mul_v2r8(iq0,jq2);
+    qq10             = _fjsp_mul_v2r8(iq1,jq0);
+    qq11             = _fjsp_mul_v2r8(iq1,jq1);
+    qq12             = _fjsp_mul_v2r8(iq1,jq2);
+    qq20             = _fjsp_mul_v2r8(iq2,jq0);
+    qq21             = _fjsp_mul_v2r8(iq2,jq1);
+    qq22             = _fjsp_mul_v2r8(iq2,jq2);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx01             = _fjsp_sub_v2r8(ix0,jx1);
+            dy01             = _fjsp_sub_v2r8(iy0,jy1);
+            dz01             = _fjsp_sub_v2r8(iz0,jz1);
+            dx02             = _fjsp_sub_v2r8(ix0,jx2);
+            dy02             = _fjsp_sub_v2r8(iy0,jy2);
+            dz02             = _fjsp_sub_v2r8(iz0,jz2);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+            rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+            rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq00,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,FF),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r01              = _fjsp_mul_v2r8(rsq01,rinv01);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r01,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq01,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq01,FF),_fjsp_mul_v2r8(vftabscale,rinv01)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+            
+            fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r02              = _fjsp_mul_v2r8(rsq02,rinv02);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r02,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq02,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq02,FF),_fjsp_mul_v2r8(vftabscale,rinv02)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+            
+            fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r10,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq10,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,FF),_fjsp_mul_v2r8(vftabscale,rinv10)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r11,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq11,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,FF),_fjsp_mul_v2r8(vftabscale,rinv11)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r12,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq12,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,FF),_fjsp_mul_v2r8(vftabscale,rinv12)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r20,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq20,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,FF),_fjsp_mul_v2r8(vftabscale,rinv20)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r21,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq21,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,FF),_fjsp_mul_v2r8(vftabscale,rinv21)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r22,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq22,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,FF),_fjsp_mul_v2r8(vftabscale,rinv22)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            gmx_fjsp_decrement_3rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 414 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx01             = _fjsp_sub_v2r8(ix0,jx1);
+            dy01             = _fjsp_sub_v2r8(iy0,jy1);
+            dz01             = _fjsp_sub_v2r8(iz0,jz1);
+            dx02             = _fjsp_sub_v2r8(ix0,jx2);
+            dy02             = _fjsp_sub_v2r8(iy0,jy2);
+            dz02             = _fjsp_sub_v2r8(iz0,jz2);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+            rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+            rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq00,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,FF),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r01              = _fjsp_mul_v2r8(rsq01,rinv01);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r01,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq01,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq01,FF),_fjsp_mul_v2r8(vftabscale,rinv01)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+            
+            fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r02              = _fjsp_mul_v2r8(rsq02,rinv02);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r02,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq02,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq02,FF),_fjsp_mul_v2r8(vftabscale,rinv02)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+            
+            fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r10,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq10,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,FF),_fjsp_mul_v2r8(vftabscale,rinv10)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r11,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq11,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,FF),_fjsp_mul_v2r8(vftabscale,rinv11)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r12,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq12,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,FF),_fjsp_mul_v2r8(vftabscale,rinv12)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r20,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq20,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,FF),_fjsp_mul_v2r8(vftabscale,rinv20)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r21,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq21,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,FF),_fjsp_mul_v2r8(vftabscale,rinv21)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r22,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq22,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,FF),_fjsp_mul_v2r8(vftabscale,rinv22)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            gmx_fjsp_decrement_3rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 414 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 19 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_VF,outeriter*19 + inneriter*414);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwNone_GeomW3W3_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: CubicSplineTable
+ * VdW interaction:            None
+ * Geometry:                   Water3-Water3
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecCSTab_VdwNone_GeomW3W3_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. the two different
+     * jnr indices whose data are put in the two positions of the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B;
+    _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B;
+    _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+    _fjsp_v2r8       dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+    real             *vftab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+
+    vftab            = kernel_data->table_elec->data;
+    vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_elec->scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+
+    jq0              = gmx_fjsp_set1_v2r8(charge[inr+0]);
+    jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+    jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+    qq00             = _fjsp_mul_v2r8(iq0,jq0);
+    qq01             = _fjsp_mul_v2r8(iq0,jq1);
+    qq02             = _fjsp_mul_v2r8(iq0,jq2);
+    qq10             = _fjsp_mul_v2r8(iq1,jq0);
+    qq11             = _fjsp_mul_v2r8(iq1,jq1);
+    qq12             = _fjsp_mul_v2r8(iq1,jq2);
+    qq20             = _fjsp_mul_v2r8(iq2,jq0);
+    qq21             = _fjsp_mul_v2r8(iq2,jq1);
+    qq22             = _fjsp_mul_v2r8(iq2,jq2);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx01             = _fjsp_sub_v2r8(ix0,jx1);
+            dy01             = _fjsp_sub_v2r8(iy0,jy1);
+            dz01             = _fjsp_sub_v2r8(iz0,jz1);
+            dx02             = _fjsp_sub_v2r8(ix0,jx2);
+            dy02             = _fjsp_sub_v2r8(iy0,jy2);
+            dz02             = _fjsp_sub_v2r8(iz0,jz2);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+            rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+            rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,FF),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r01              = _fjsp_mul_v2r8(rsq01,rinv01);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r01,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq01,FF),_fjsp_mul_v2r8(vftabscale,rinv01)));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+            
+            fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r02              = _fjsp_mul_v2r8(rsq02,rinv02);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r02,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq02,FF),_fjsp_mul_v2r8(vftabscale,rinv02)));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+            
+            fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r10,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,FF),_fjsp_mul_v2r8(vftabscale,rinv10)));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r11,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,FF),_fjsp_mul_v2r8(vftabscale,rinv11)));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r12,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,FF),_fjsp_mul_v2r8(vftabscale,rinv12)));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r20,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,FF),_fjsp_mul_v2r8(vftabscale,rinv20)));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r21,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,FF),_fjsp_mul_v2r8(vftabscale,rinv21)));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r22,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,FF),_fjsp_mul_v2r8(vftabscale,rinv22)));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            gmx_fjsp_decrement_3rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 378 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx01             = _fjsp_sub_v2r8(ix0,jx1);
+            dy01             = _fjsp_sub_v2r8(iy0,jy1);
+            dz01             = _fjsp_sub_v2r8(iz0,jz1);
+            dx02             = _fjsp_sub_v2r8(ix0,jx2);
+            dy02             = _fjsp_sub_v2r8(iy0,jy2);
+            dz02             = _fjsp_sub_v2r8(iz0,jz2);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+            rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+            rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,FF),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r01              = _fjsp_mul_v2r8(rsq01,rinv01);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r01,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq01,FF),_fjsp_mul_v2r8(vftabscale,rinv01)));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+            
+            fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r02              = _fjsp_mul_v2r8(rsq02,rinv02);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r02,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq02,FF),_fjsp_mul_v2r8(vftabscale,rinv02)));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+            
+            fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r10,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,FF),_fjsp_mul_v2r8(vftabscale,rinv10)));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r11,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,FF),_fjsp_mul_v2r8(vftabscale,rinv11)));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r12,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,FF),_fjsp_mul_v2r8(vftabscale,rinv12)));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r20,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,FF),_fjsp_mul_v2r8(vftabscale,rinv20)));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r21,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,FF),_fjsp_mul_v2r8(vftabscale,rinv21)));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r22,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,FF),_fjsp_mul_v2r8(vftabscale,rinv22)));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            gmx_fjsp_decrement_3rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 378 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 18 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_F,outeriter*18 + inneriter*378);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCSTab_VdwNone_GeomW4P1_sparc64_hpc_ace_double.c b/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCSTab_VdwNone_GeomW4P1_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..92809fa
--- /dev/null
@@ -0,0 +1,1026 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwNone_GeomW4P1_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: CubicSplineTable
+ * VdW interaction:            None
+ * Geometry:                   Water4-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecCSTab_VdwNone_GeomW4P1_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+    real             *vftab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+
+    vftab            = kernel_data->table_elec->data;
+    vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_elec->scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
+                                                 &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+        fix3             = _fjsp_setzero_v2r8();
+        fiy3             = _fjsp_setzero_v2r8();
+        fiz3             = _fjsp_setzero_v2r8();
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx30             = _fjsp_sub_v2r8(ix3,jx0);
+            dy30             = _fjsp_sub_v2r8(iy3,jy0);
+            dz30             = _fjsp_sub_v2r8(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r10,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq10,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,FF),_fjsp_mul_v2r8(vftabscale,rinv10)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r20,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq20,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,FF),_fjsp_mul_v2r8(vftabscale,rinv20)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r30              = _fjsp_mul_v2r8(rsq30,rinv30);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _fjsp_mul_v2r8(iq3,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r30,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq30,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq30,FF),_fjsp_mul_v2r8(vftabscale,rinv30)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+            
+            fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+
+            gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 141 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx30             = _fjsp_sub_v2r8(ix3,jx0);
+            dy30             = _fjsp_sub_v2r8(iy3,jy0);
+            dz30             = _fjsp_sub_v2r8(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+
+            /* Load parameters for j particles */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r10,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq10,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,FF),_fjsp_mul_v2r8(vftabscale,rinv10)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r20,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq20,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,FF),_fjsp_mul_v2r8(vftabscale,rinv20)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r30              = _fjsp_mul_v2r8(rsq30,rinv30);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _fjsp_mul_v2r8(iq3,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r30,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq30,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq30,FF),_fjsp_mul_v2r8(vftabscale,rinv30)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+            
+            fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+
+            gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 141 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset+DIM,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 19 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4_VF,outeriter*19 + inneriter*141);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwNone_GeomW4P1_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: CubicSplineTable
+ * VdW interaction:            None
+ * Geometry:                   Water4-Particle
+ * Calculate force/pot:        Force
+ *
+ * Summary (from the code below):
+ *  - Loops over the nlist->nri i entries; for each one, the three sites at
+ *    atom offsets 1..3 from iinr[] interact with every j particle in
+ *    jjnr[jindex[i] .. jindex[i+1]-1]. The atom at offset 0 is not touched.
+ *  - Per i-site/j pair, a scalar force is interpolated from the table in
+ *    kernel_data->table_elec (data, scale) and multiplied by the charge
+ *    product, where the i charge is premultiplied by fr->epsfac.
+ *  - Outputs: forces are written through ff (i and j atoms) and fr->fshift
+ *    by the update/decrement helpers, and a flop count is added to nrnb.
+ *    No energies are computed in this variant.
+ *  - j particles are processed two per iteration (SIMD lanes A and B), with
+ *    a separate single-j pass when the j count is odd.
+ */
+void
+nb_kernel_ElecCSTab_VdwNone_GeomW4P1_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     * NOTE(review): a number of the locals declared below (e.g. tx, ty, tz, the rcutoff and
+     * vdwioffset variables, the c6_NN and c12_NN parameters, velec, velecsum, VV, Heps, one, two,
+     * gbconv, ewconv) are never referenced in this kernel; presumably they are left over from a
+     * shared generator template.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+    real             *vftab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+
+    vftab            = kernel_data->table_elec->data;
+    vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_elec->scale);
+
+    /* Setup water-specific parameters.
+     * The charges of sites 1..3 are read once, from the first i entry (iinr[0]),
+     * premultiplied by facel (fr->epsfac), and reused for every i entry in the list.
+     */
+    inr              = nlist->iinr[0];
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+
+    /* Give these a defined value up front to silence compiler warnings
+     * (presumably about possibly-uninitialized use). */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector.
+         * The +DIM offset skips the first atom of the i entry, so sites 1..3 are loaded. */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
+                                                 &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+        fix3             = _fjsp_setzero_v2r8();
+        fiy3             = _fjsp_setzero_v2r8();
+        fiz3             = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop: two j particles (A and B) per iteration.
+         * The bound j_index_end-1 leaves a final unpaired j for the block after the loop. */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx30             = _fjsp_sub_v2r8(ix3,jx0);
+            dy30             = _fjsp_sub_v2r8(iy3,jy0);
+            dz30             = _fjsp_sub_v2r8(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r10,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS
+             * The table index is scaled by 4 above, and two loads (offset +0 and +2) fetch four
+             * consecutive reals per lane: lane A from index i[0], lane B from index i[1].
+             * Presumably the transposes regroup these so that each of Y,F,G,H holds one
+             * coefficient for both lanes -- TODO confirm against GMX_FJSP_TRANSPOSE2_V2R8.
+             * NOTE(review): assuming _fjsp_madd_v2r8(a,b,c) computes a*b+c, the lines below give
+             *   Fp = F + eps*(G + eps*H)   and   FF = Fp + eps*(G + 2*eps*H),
+             * followed by felec = -(qq*FF)*(vftabscale*rinv). Y is loaded but not used further,
+             * since this variant computes no potential.
+             */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,FF),_fjsp_mul_v2r8(vftabscale,rinv10)));
+
+            fscal            = felec;
+
+            /* Update vectorial force.
+             * The same increment is accumulated for the i site and for the j particle; the j sum
+             * is handed to the decrement helper at the end of this iteration. */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r20,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS (same scheme as for site 1, now for site 2) */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,FF),_fjsp_mul_v2r8(vftabscale,rinv20)));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r30              = _fjsp_mul_v2r8(rsq30,rinv30);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _fjsp_mul_v2r8(iq3,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r30,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS (same scheme as for site 1, now for site 3) */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq30,FF),_fjsp_mul_v2r8(vftabscale,rinv30)));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+            
+            fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+
+            gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 129 flops */
+        }
+
+        if(jidx<j_index_end) /* one unpaired j particle is left when the j count is odd */
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates (single j; only one pointer is passed to the loader) */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx30             = _fjsp_sub_v2r8(ix3,jx0);
+            dy30             = _fjsp_sub_v2r8(iy3,jy0);
+            dz30             = _fjsp_sub_v2r8(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+
+            /* Load parameters for j particles.
+             * Only the charge of jnrA is read, on top of a zeroed register. */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r10,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS
+             * Single-j variant: only the table entry at index i[0] is loaded, and the second
+             * operand of each transpose is a zero register. vfconv.i[1] is scaled above but
+             * never used as a table offset in this block.
+             */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,FF),_fjsp_mul_v2r8(vftabscale,rinv10)));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8()); /* presumably clears the unused second lane -- TODO confirm */
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r20,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS (single-j variant, site 2) */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,FF),_fjsp_mul_v2r8(vftabscale,rinv20)));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r30              = _fjsp_mul_v2r8(rsq30,rinv30);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _fjsp_mul_v2r8(iq3,jq0);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r30,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS (single-j variant, site 3) */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq30,FF),_fjsp_mul_v2r8(vftabscale,rinv30)));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+            
+            fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+
+            gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 129 flops */
+        }
+
+        /* End of innermost loop.
+         * The accumulated i forces for sites 1..3 are passed to the update helper together with
+         * the force pointer (again offset by DIM to skip the first atom) and the shift-force slot. */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset+DIM,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations (counted per j entry, not per SIMD iteration) */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 18 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops: 18 per i entry plus 129 per j entry, reported to nrnb
+     * under eNR_NBKERNEL_ELEC_W4_F. */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4_F,outeriter*18 + inneriter*129);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCSTab_VdwNone_GeomW4W4_sparc64_hpc_ace_double.c b/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCSTab_VdwNone_GeomW4W4_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..c865db2
--- /dev/null
@@ -0,0 +1,2170 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwNone_GeomW4W4_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: CubicSplineTable
+ * VdW interaction:            None
+ * Geometry:                   Water4-Water4
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecCSTab_VdwNone_GeomW4W4_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx1A,vdwjidx1B;
+    _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B;
+    _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    int              vdwjidx3A,vdwjidx3B;
+    _fjsp_v2r8       jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+    _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    _fjsp_v2r8       dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+    _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    _fjsp_v2r8       dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+    _fjsp_v2r8       dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+    _fjsp_v2r8       dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+    _fjsp_v2r8       dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+    real             *vftab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+
+    vftab            = kernel_data->table_elec->data;
+    vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_elec->scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+
+    jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+    jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+    jq3              = gmx_fjsp_set1_v2r8(charge[inr+3]);
+    qq11             = _fjsp_mul_v2r8(iq1,jq1);
+    qq12             = _fjsp_mul_v2r8(iq1,jq2);
+    qq13             = _fjsp_mul_v2r8(iq1,jq3);
+    qq21             = _fjsp_mul_v2r8(iq2,jq1);
+    qq22             = _fjsp_mul_v2r8(iq2,jq2);
+    qq23             = _fjsp_mul_v2r8(iq2,jq3);
+    qq31             = _fjsp_mul_v2r8(iq3,jq1);
+    qq32             = _fjsp_mul_v2r8(iq3,jq2);
+    qq33             = _fjsp_mul_v2r8(iq3,jq3);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
+                                                 &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+        fix3             = _fjsp_setzero_v2r8();
+        fiy3             = _fjsp_setzero_v2r8();
+        fiz3             = _fjsp_setzero_v2r8();
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
+                                              &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx13             = _fjsp_sub_v2r8(ix1,jx3);
+            dy13             = _fjsp_sub_v2r8(iy1,jy3);
+            dz13             = _fjsp_sub_v2r8(iz1,jz3);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+            dx23             = _fjsp_sub_v2r8(ix2,jx3);
+            dy23             = _fjsp_sub_v2r8(iy2,jy3);
+            dz23             = _fjsp_sub_v2r8(iz2,jz3);
+            dx31             = _fjsp_sub_v2r8(ix3,jx1);
+            dy31             = _fjsp_sub_v2r8(iy3,jy1);
+            dz31             = _fjsp_sub_v2r8(iz3,jz1);
+            dx32             = _fjsp_sub_v2r8(ix3,jx2);
+            dy32             = _fjsp_sub_v2r8(iy3,jy2);
+            dz32             = _fjsp_sub_v2r8(iz3,jz2);
+            dx33             = _fjsp_sub_v2r8(ix3,jx3);
+            dy33             = _fjsp_sub_v2r8(iy3,jy3);
+            dz33             = _fjsp_sub_v2r8(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+            rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+            rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+            rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+            rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+            rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+            rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+            rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+            rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+            fjx3             = _fjsp_setzero_v2r8();
+            fjy3             = _fjsp_setzero_v2r8();
+            fjz3             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r11,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq11,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,FF),_fjsp_mul_v2r8(vftabscale,rinv11)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r12,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq12,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,FF),_fjsp_mul_v2r8(vftabscale,rinv12)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r13              = _fjsp_mul_v2r8(rsq13,rinv13);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r13,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq13,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq13,FF),_fjsp_mul_v2r8(vftabscale,rinv13)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+            
+            fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r21,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq21,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,FF),_fjsp_mul_v2r8(vftabscale,rinv21)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r22,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq22,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,FF),_fjsp_mul_v2r8(vftabscale,rinv22)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r23              = _fjsp_mul_v2r8(rsq23,rinv23);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r23,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq23,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq23,FF),_fjsp_mul_v2r8(vftabscale,rinv23)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+            
+            fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r31              = _fjsp_mul_v2r8(rsq31,rinv31);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r31,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq31,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq31,FF),_fjsp_mul_v2r8(vftabscale,rinv31)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+            
+            fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r32              = _fjsp_mul_v2r8(rsq32,rinv32);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r32,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq32,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq32,FF),_fjsp_mul_v2r8(vftabscale,rinv32)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+            
+            fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r33              = _fjsp_mul_v2r8(rsq33,rinv33);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r33,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq33,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq33,FF),_fjsp_mul_v2r8(vftabscale,rinv33)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+            
+            fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+
+            gmx_fjsp_decrement_3rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA+DIM,f+j_coord_offsetB+DIM,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 414 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA+DIM,
+                                              &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx13             = _fjsp_sub_v2r8(ix1,jx3);
+            dy13             = _fjsp_sub_v2r8(iy1,jy3);
+            dz13             = _fjsp_sub_v2r8(iz1,jz3);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+            dx23             = _fjsp_sub_v2r8(ix2,jx3);
+            dy23             = _fjsp_sub_v2r8(iy2,jy3);
+            dz23             = _fjsp_sub_v2r8(iz2,jz3);
+            dx31             = _fjsp_sub_v2r8(ix3,jx1);
+            dy31             = _fjsp_sub_v2r8(iy3,jy1);
+            dz31             = _fjsp_sub_v2r8(iz3,jz1);
+            dx32             = _fjsp_sub_v2r8(ix3,jx2);
+            dy32             = _fjsp_sub_v2r8(iy3,jy2);
+            dz32             = _fjsp_sub_v2r8(iz3,jz2);
+            dx33             = _fjsp_sub_v2r8(ix3,jx3);
+            dy33             = _fjsp_sub_v2r8(iy3,jy3);
+            dz33             = _fjsp_sub_v2r8(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+            rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+            rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+            rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+            rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+            rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+            rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+            rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+            rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+            fjx3             = _fjsp_setzero_v2r8();
+            fjy3             = _fjsp_setzero_v2r8();
+            fjz3             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r11,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq11,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,FF),_fjsp_mul_v2r8(vftabscale,rinv11)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r12,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq12,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,FF),_fjsp_mul_v2r8(vftabscale,rinv12)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r13              = _fjsp_mul_v2r8(rsq13,rinv13);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r13,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq13,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq13,FF),_fjsp_mul_v2r8(vftabscale,rinv13)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+            
+            fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r21,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq21,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,FF),_fjsp_mul_v2r8(vftabscale,rinv21)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r22,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq22,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,FF),_fjsp_mul_v2r8(vftabscale,rinv22)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r23              = _fjsp_mul_v2r8(rsq23,rinv23);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r23,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq23,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq23,FF),_fjsp_mul_v2r8(vftabscale,rinv23)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+            
+            fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r31              = _fjsp_mul_v2r8(rsq31,rinv31);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r31,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq31,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq31,FF),_fjsp_mul_v2r8(vftabscale,rinv31)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+            
+            fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r32              = _fjsp_mul_v2r8(rsq32,rinv32);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r32,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq32,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq32,FF),_fjsp_mul_v2r8(vftabscale,rinv32)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+            
+            fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r33              = _fjsp_mul_v2r8(rsq33,rinv33);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r33,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq33,VV);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq33,FF),_fjsp_mul_v2r8(vftabscale,rinv33)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+            
+            fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+
+            gmx_fjsp_decrement_3rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA+DIM,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 414 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset+DIM,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 19 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_VF,outeriter*19 + inneriter*414);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwNone_GeomW4W4_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: CubicSplineTable
+ * VdW interaction:            None
+ * Geometry:                   Water4-Water4
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecCSTab_VdwNone_GeomW4W4_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx1A,vdwjidx1B;
+    _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B;
+    _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    int              vdwjidx3A,vdwjidx3B;
+    _fjsp_v2r8       jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+    _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    _fjsp_v2r8       dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+    _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    _fjsp_v2r8       dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+    _fjsp_v2r8       dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+    _fjsp_v2r8       dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+    _fjsp_v2r8       dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+    real             *vftab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+
+    vftab            = kernel_data->table_elec->data;
+    vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_elec->scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+
+    jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+    jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+    jq3              = gmx_fjsp_set1_v2r8(charge[inr+3]);
+    qq11             = _fjsp_mul_v2r8(iq1,jq1);
+    qq12             = _fjsp_mul_v2r8(iq1,jq2);
+    qq13             = _fjsp_mul_v2r8(iq1,jq3);
+    qq21             = _fjsp_mul_v2r8(iq2,jq1);
+    qq22             = _fjsp_mul_v2r8(iq2,jq2);
+    qq23             = _fjsp_mul_v2r8(iq2,jq3);
+    qq31             = _fjsp_mul_v2r8(iq3,jq1);
+    qq32             = _fjsp_mul_v2r8(iq3,jq2);
+    qq33             = _fjsp_mul_v2r8(iq3,jq3);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
+                                                 &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+        fix3             = _fjsp_setzero_v2r8();
+        fiy3             = _fjsp_setzero_v2r8();
+        fiz3             = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
+                                              &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx13             = _fjsp_sub_v2r8(ix1,jx3);
+            dy13             = _fjsp_sub_v2r8(iy1,jy3);
+            dz13             = _fjsp_sub_v2r8(iz1,jz3);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+            dx23             = _fjsp_sub_v2r8(ix2,jx3);
+            dy23             = _fjsp_sub_v2r8(iy2,jy3);
+            dz23             = _fjsp_sub_v2r8(iz2,jz3);
+            dx31             = _fjsp_sub_v2r8(ix3,jx1);
+            dy31             = _fjsp_sub_v2r8(iy3,jy1);
+            dz31             = _fjsp_sub_v2r8(iz3,jz1);
+            dx32             = _fjsp_sub_v2r8(ix3,jx2);
+            dy32             = _fjsp_sub_v2r8(iy3,jy2);
+            dz32             = _fjsp_sub_v2r8(iz3,jz2);
+            dx33             = _fjsp_sub_v2r8(ix3,jx3);
+            dy33             = _fjsp_sub_v2r8(iy3,jy3);
+            dz33             = _fjsp_sub_v2r8(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+            rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+            rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+            rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+            rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+            rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+            rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+            rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+            rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+            fjx3             = _fjsp_setzero_v2r8();
+            fjy3             = _fjsp_setzero_v2r8();
+            fjz3             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r11,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,FF),_fjsp_mul_v2r8(vftabscale,rinv11)));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r12,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,FF),_fjsp_mul_v2r8(vftabscale,rinv12)));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r13              = _fjsp_mul_v2r8(rsq13,rinv13);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r13,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq13,FF),_fjsp_mul_v2r8(vftabscale,rinv13)));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+            
+            fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r21,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,FF),_fjsp_mul_v2r8(vftabscale,rinv21)));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r22,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,FF),_fjsp_mul_v2r8(vftabscale,rinv22)));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r23              = _fjsp_mul_v2r8(rsq23,rinv23);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r23,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq23,FF),_fjsp_mul_v2r8(vftabscale,rinv23)));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+            
+            fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r31              = _fjsp_mul_v2r8(rsq31,rinv31);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r31,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq31,FF),_fjsp_mul_v2r8(vftabscale,rinv31)));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+            
+            fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r32              = _fjsp_mul_v2r8(rsq32,rinv32);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r32,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq32,FF),_fjsp_mul_v2r8(vftabscale,rinv32)));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+            
+            fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r33              = _fjsp_mul_v2r8(rsq33,rinv33);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r33,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq33,FF),_fjsp_mul_v2r8(vftabscale,rinv33)));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+            
+            fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+
+            gmx_fjsp_decrement_3rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA+DIM,f+j_coord_offsetB+DIM,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 378 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA+DIM,
+                                              &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx13             = _fjsp_sub_v2r8(ix1,jx3);
+            dy13             = _fjsp_sub_v2r8(iy1,jy3);
+            dz13             = _fjsp_sub_v2r8(iz1,jz3);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+            dx23             = _fjsp_sub_v2r8(ix2,jx3);
+            dy23             = _fjsp_sub_v2r8(iy2,jy3);
+            dz23             = _fjsp_sub_v2r8(iz2,jz3);
+            dx31             = _fjsp_sub_v2r8(ix3,jx1);
+            dy31             = _fjsp_sub_v2r8(iy3,jy1);
+            dz31             = _fjsp_sub_v2r8(iz3,jz1);
+            dx32             = _fjsp_sub_v2r8(ix3,jx2);
+            dy32             = _fjsp_sub_v2r8(iy3,jy2);
+            dz32             = _fjsp_sub_v2r8(iz3,jz2);
+            dx33             = _fjsp_sub_v2r8(ix3,jx3);
+            dy33             = _fjsp_sub_v2r8(iy3,jy3);
+            dz33             = _fjsp_sub_v2r8(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+            rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+            rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+            rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+            rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+            rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+            rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+            rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+            rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+            fjx3             = _fjsp_setzero_v2r8();
+            fjy3             = _fjsp_setzero_v2r8();
+            fjz3             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r11,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,FF),_fjsp_mul_v2r8(vftabscale,rinv11)));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r12,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,FF),_fjsp_mul_v2r8(vftabscale,rinv12)));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r13              = _fjsp_mul_v2r8(rsq13,rinv13);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r13,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq13,FF),_fjsp_mul_v2r8(vftabscale,rinv13)));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+            
+            fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r21,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,FF),_fjsp_mul_v2r8(vftabscale,rinv21)));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r22,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,FF),_fjsp_mul_v2r8(vftabscale,rinv22)));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r23              = _fjsp_mul_v2r8(rsq23,rinv23);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r23,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq23,FF),_fjsp_mul_v2r8(vftabscale,rinv23)));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+            
+            fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r31              = _fjsp_mul_v2r8(rsq31,rinv31);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r31,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq31,FF),_fjsp_mul_v2r8(vftabscale,rinv31)));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+            
+            fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r32              = _fjsp_mul_v2r8(rsq32,rinv32);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r32,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq32,FF),_fjsp_mul_v2r8(vftabscale,rinv32)));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+            
+            fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r33              = _fjsp_mul_v2r8(rsq33,rinv33);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r33,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq33,FF),_fjsp_mul_v2r8(vftabscale,rinv33)));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+            
+            fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+
+            gmx_fjsp_decrement_3rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA+DIM,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 378 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset+DIM,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 18 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_F,outeriter*18 + inneriter*378);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCoul_VdwCSTab_GeomP1P1_sparc64_hpc_ace_double.c b/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCoul_VdwCSTab_GeomP1P1_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..6acc0a7
--- /dev/null
@@ -0,0 +1,679 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecCoul_VdwCSTab_GeomP1P1_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: Coulomb
+ * VdW interaction:            CubicSplineTable
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        PotentialAndForce
+ *
+ * For every i particle in the neighbor list, the j neighbors are processed two
+ * at a time (one per SIMD lane, suffixes A and B); a trailing unpaired neighbor
+ * is handled in a separate single-atom step after the inner loop.
+ *
+ * Parameters, as used by this function:
+ *   nlist       - neighbor list; nri, iinr, jindex, jjnr, shift and gid are read.
+ *   xx          - coordinates, accessed as a flat real array through xx[0].
+ *   ff          - forces, accessed as a flat real array through ff[0] and handed
+ *                 to the force update helpers.
+ *   fr          - shift_vec, epsfac, ntype and nbfp are read; fshift is handed to
+ *                 the i-force update helper.
+ *   mdatoms     - chargeA and typeA are read.
+ *   kernel_data - table_vdw (data and scale) is read; energygrp_elec and
+ *                 energygrp_vdw are handed to the potential update helper.
+ *   nrnb        - passed to inc_nrnb() together with the flop estimate.
+ */
+void
+nb_kernel_ElecCoul_VdwCSTab_GeomP1P1_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     * NOTE(review): several of the locals declared below are never referenced in this
+     * kernel; they appear to come from the shared generator template.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+    real             *vftab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    vftab            = kernel_data->table_vdw->data;
+    vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_vdw->scale);
+
+    /* Avoid stupid compiler warnings: give the j indices and offsets a defined value
+     * here; they are reassigned from jjnr inside the loops before being used.
+     */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_1rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+
+        /* Load parameters for i particles.
+         * iq0 is the i charge multiplied by facel (fr->epsfac). vdwioffset0 is the offset
+         * of this i atom type's row in vdwparam; rows hold 2 reals per j type, which are
+         * loaded below into c6_00 and c12_00.
+         */
+        iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_load1_v2r8(charge+inr+0));
+        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+        vvdwsum          = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop.
+         * j neighbors are taken in pairs (jidx and jidx+1); an unpaired last neighbor
+         * is left for the if-block that follows the loop.
+         */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer.
+             * vfeps is rt minus the integer part converted back to double, and twovfeps is
+             * vfeps+vfeps. The two integer indices are then read back through the vfconv
+             * union and multiplied by 8, the number of reals per table point used below
+             * (offsets +0..+3 for dispersion and +4..+7 for repulsion, assuming each
+             * _fjsp_load_v2r8 reads two consecutive reals - TODO confirm).
+             */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 8;
+            vfconv.i[1]     *= 8;
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq00,rinv00);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq00);
+
+            /* CUBIC SPLINE TABLE DISPERSION.
+             * The data for table index i[0] and i[1] is loaded separately and transposed
+             * into Y,F and G,H. Assuming _fjsp_madd_v2r8(a,b,c) computes a*b+c (TODO confirm
+             * against the intrinsics reference), the lines below evaluate
+             *   Fp = F + vfeps*(G + vfeps*H)
+             *   VV = Y + vfeps*Fp
+             *   FF = Fp + vfeps*(G + twovfeps*H)
+             * and scale VV and FF by c6_00.
+             */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 2 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION.
+             * Same evaluation as for dispersion, using the table loads at offsets +4 and +6
+             * and scaling by c12_00. The two contributions are then summed into vvdw, and
+             * fvdw is formed from (fvdw6+fvdw12), vftabscale and rinv00 with a final negation.
+             */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 4 );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 6 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            vvdw             = _fjsp_add_v2r8(vvdw12,vvdw6);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fscal,dx00,dy00,dz00);
+
+            /* Inner loop uses 66 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* Single remaining j atom (A only): load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+
+            /* Load parameters for j particles.
+             * NOTE(review): _fjsp_loadl_v2r8 on a zeroed register presumably places the
+             * charge in one lane and leaves the other at zero - confirm.
+             */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer.
+             * Both vfconv.i[] entries are scaled by 8 as in the paired loop, but only
+             * vfconv.i[0] is used for the table loads in this single-atom step.
+             */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 8;
+            vfconv.i[1]     *= 8;
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq00,rinv00);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq00);
+
+            /* CUBIC SPLINE TABLE DISPERSION.
+             * Same evaluation as in the paired loop, except that F and H are set to zero
+             * before the transpose instead of being loaded from a second table index.
+             */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION (table loads at offsets +4 and +6 of index i[0]) */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            vvdw             = _fjsp_add_v2r8(vvdw12,vvdw6);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom.
+             * NOTE(review): the unpacklo with a zero register is presumably there to clear
+             * the lane that holds no real j atom before accumulating - confirm against the
+             * intrinsic's definition. The same is done for fscal below.
+             */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+            vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fscal,dx00,dy00,dz00);
+
+            /* Inner loop uses 66 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_1atom_swizzle_v2r8(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies: the sums for this i atom go to entry ggid
+         * (gid[iidx]) of the electrostatic and VdW energy group arrays.
+         */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations (counts every j neighbor, paired or not) */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 9 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops: 9 per outer iteration plus 66 per inner iteration */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_VF,outeriter*9 + inneriter*66);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecCoul_VdwCSTab_GeomP1P1_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: Coulomb
+ * VdW interaction:            CubicSplineTable
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecCoul_VdwCSTab_GeomP1P1_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+    real             *vftab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    vftab            = kernel_data->table_vdw->data;
+    vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_vdw->scale);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_1rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+
+        /* Load parameters for i particles */
+        iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_load1_v2r8(charge+inr+0));
+        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 8;
+            vfconv.i[1]     *= 8;
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq00,rinv00);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq00);
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 2 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 4 );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 6 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fscal,dx00,dy00,dz00);
+
+            /* Inner loop uses 57 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 8;
+            vfconv.i[1]     *= 8;
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq00,rinv00);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq00);
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fscal,dx00,dy00,dz00);
+
+            /* Inner loop uses 57 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_1atom_swizzle_v2r8(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 7 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_F,outeriter*7 + inneriter*57);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCoul_VdwCSTab_GeomW3P1_sparc64_hpc_ace_double.c b/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCoul_VdwCSTab_GeomW3P1_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..c48a755
--- /dev/null
@@ -0,0 +1,989 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecCoul_VdwCSTab_GeomW3P1_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: Coulomb
+ * VdW interaction:            CubicSplineTable
+ * Geometry:                   Water3-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecCoul_VdwCSTab_GeomW3P1_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+    real             *vftab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    vftab            = kernel_data->table_vdw->data;
+    vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_vdw->scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+        vvdwsum          = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 8;
+            vfconv.i[1]     *= 8;
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq00,rinv00);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq00);
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 2 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 4 );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 6 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            vvdw             = _fjsp_add_v2r8(vvdw12,vvdw6);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq10,rinv10);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq10);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq20,rinv20);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq20);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 131 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 8;
+            vfconv.i[1]     *= 8;
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq00,rinv00);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq00);
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            vvdw             = _fjsp_add_v2r8(vvdw12,vvdw6);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+            vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq10,rinv10);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq10);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq20,rinv20);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq20);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 131 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 20 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_VF,outeriter*20 + inneriter*131);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecCoul_VdwCSTab_GeomW3P1_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: Coulomb
+ * VdW interaction:            CubicSplineTable
+ * Geometry:                   Water3-Particle
+ * Calculate force/pot:        Force
+ *
+ * Force-only variant: no potential sums are accumulated or written here.
+ * Each i entry is treated as three atoms (inr+0..inr+2), each interacting with
+ * single j atoms. All three i atoms get a Coulomb term; only i atom 0 gets the
+ * tabulated VdW term.
+ *
+ * Arguments, as used by this kernel:
+ *   nlist        reads nri, iinr, jindex, jjnr, shift (gid is fetched but unused)
+ *   xx           coordinates, read through a flat real pointer
+ *   ff           forces, updated through the gmx_fjsp_decrement_1rvec_* and
+ *                gmx_fjsp_update_iforce_3atom_* helpers
+ *   fr           reads shift_vec, fshift, epsfac, ntype, nbfp
+ *   mdatoms      reads chargeA and typeA
+ *   kernel_data  reads table_vdw->data and table_vdw->scale
+ *   nrnb         receives the flop count through inc_nrnb()
+ *
+ * NOTE(review): the i-atom charges and the VdW offset are read once from
+ * nlist->iinr[0], before the outer loop and without checking nri > 0, so every
+ * i entry is presumably expected to carry the same parameters - confirm with caller.
+ */
+void
+nb_kernel_ElecCoul_VdwCSTab_GeomW3P1_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     *
+     * Many of the locals declared below are never used in this force-only variant
+     * (for example rcutoff_scalar, tx/ty/tz, ggid, velecsum, vvdwsum and VV).
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+    real             *vftab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv; /* vfconv: SIMD table index read back as two integers */
+
+    x                = xx[0]; /* flat real view of the coordinates, indexed with DIM*atom below */
+    f                = ff[0]; /* flat real view of the forces, indexed the same way */
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    vftab            = kernel_data->table_vdw->data;
+    vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_vdw->scale);
+
+    /* Setup water-specific parameters
+     * The three i charges come from the first i entry only and are pre-multiplied
+     * by facel (fr->epsfac). Only i atom 0 gets a VdW parameter offset.
+     * NOTE(review): iinr[0] is read here even if nri is 0 - confirm this is safe.
+     */
+    inr              = nlist->iinr[0];
+    iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop
+         * Two j atoms (A and B) are handled per iteration. If the number of j
+         * atoms is odd, the last one is handled by the if-block after this loop.
+         */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer
+             * vfeps is rt minus the truncated value converted back to double.
+             * The stored integer index is multiplied by 8; the table loads below
+             * start at offsets 0, 2, 4 and 6 from that scaled index.
+             */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 8;
+            vfconv.i[1]     *= 8;
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq00,rinv00);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq00);
+
+            /* CUBIC SPLINE TABLE DISPERSION
+             * Table entries for atom A (index i[0]) and atom B (index i[1]) are
+             * loaded separately and then passed through the transpose macro.
+             * Y takes part in the transpose but is not used afterwards in this
+             * force-only kernel.
+             * Assuming madd(a,b,c) computes a*b+c (TODO confirm):
+             *   Fp = F + vfeps*(G + vfeps*H),  FF = Fp + vfeps*(G + 2*vfeps*H)
+             */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 2 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION
+             * Same evaluation as for dispersion, using table offsets 4 and 6 and
+             * scaling by c12. The final fvdw line negates (fvdw6+fvdw12) and
+             * multiplies it by vftabscale*rinv00.
+             */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 4 );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 6 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            /* Update vectorial force
+             * The same d*fscal product is accumulated for the i atom and for the
+             * j atom; the j accumulators are handed to the decrement helper at the
+             * end of the iteration.
+             */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq10,rinv10);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq10);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq20,rinv20);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq20);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 120 flops */
+        }
+
+        if(jidx<j_index_end) /* one j atom left over (odd count): same work with only atom A */
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+
+            /* Load parameters for j particles
+             * Only one j charge is loaded here; the other operand of the load is
+             * a zero vector (presumably leaving the unused lane at zero - TODO confirm).
+             */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 8;
+            vfconv.i[1]     *= 8;
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq00,rinv00);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq00);
+
+            /* CUBIC SPLINE TABLE DISPERSION
+             * Only the table entry for index i[0] is loaded; the partner operand
+             * of each transpose is a zero vector instead of a second table load.
+             */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8()); /* presumably zeroes the lane with no j atom - TODO confirm */
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq10,rinv10);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq10);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq20,rinv20);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq20);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 120 flops */
+        }
+
+        /* End of innermost loop
+         * The nine i-force accumulators are handed to the update helper together
+         * with the force and shift-force locations for this i entry.
+         */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 18 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops
+     * Uses the per-iteration counts stated in the comments above:
+     * 18 per outer iteration and 120 per inner (j atom) iteration.
+     */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_F,outeriter*18 + inneriter*120);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCoul_VdwCSTab_GeomW3W3_sparc64_hpc_ace_double.c b/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCoul_VdwCSTab_GeomW3W3_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..3786c76
--- /dev/null
@@ -0,0 +1,1671 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecCoul_VdwCSTab_GeomW3W3_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: Coulomb
+ * VdW interaction:            CubicSplineTable
+ * Geometry:                   Water3-Water3
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecCoul_VdwCSTab_GeomW3W3_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B;
+    _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B;
+    _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+    _fjsp_v2r8       dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+    real             *vftab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    vftab            = kernel_data->table_vdw->data;
+    vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_vdw->scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq0              = gmx_fjsp_set1_v2r8(charge[inr+0]);
+    jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+    jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    qq00             = _fjsp_mul_v2r8(iq0,jq0);
+    c6_00            = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    qq01             = _fjsp_mul_v2r8(iq0,jq1);
+    qq02             = _fjsp_mul_v2r8(iq0,jq2);
+    qq10             = _fjsp_mul_v2r8(iq1,jq0);
+    qq11             = _fjsp_mul_v2r8(iq1,jq1);
+    qq12             = _fjsp_mul_v2r8(iq1,jq2);
+    qq20             = _fjsp_mul_v2r8(iq2,jq0);
+    qq21             = _fjsp_mul_v2r8(iq2,jq1);
+    qq22             = _fjsp_mul_v2r8(iq2,jq2);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+        vvdwsum          = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx01             = _fjsp_sub_v2r8(ix0,jx1);
+            dy01             = _fjsp_sub_v2r8(iy0,jy1);
+            dz01             = _fjsp_sub_v2r8(iz0,jz1);
+            dx02             = _fjsp_sub_v2r8(ix0,jx2);
+            dy02             = _fjsp_sub_v2r8(iy0,jy2);
+            dz02             = _fjsp_sub_v2r8(iz0,jz2);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+            rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+            rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+            rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 8;
+            vfconv.i[1]     *= 8;
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq00,rinv00);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq00);
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 2 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 4 );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 6 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            vvdw             = _fjsp_add_v2r8(vvdw12,vvdw6);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq01,rinv01);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq01);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+            
+            fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq02,rinv02);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq02);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+            
+            fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq10,rinv10);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq10);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq11,rinv11);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq11);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq12,rinv12);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq12);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq20,rinv20);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq20);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq21,rinv21);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq21);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq22,rinv22);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq22);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            gmx_fjsp_decrement_3rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 314 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx01             = _fjsp_sub_v2r8(ix0,jx1);
+            dy01             = _fjsp_sub_v2r8(iy0,jy1);
+            dz01             = _fjsp_sub_v2r8(iz0,jz1);
+            dx02             = _fjsp_sub_v2r8(ix0,jx2);
+            dy02             = _fjsp_sub_v2r8(iy0,jy2);
+            dz02             = _fjsp_sub_v2r8(iz0,jz2);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+            rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+            rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+            rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 8;
+            vfconv.i[1]     *= 8;
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq00,rinv00);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq00);
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            vvdw             = _fjsp_add_v2r8(vvdw12,vvdw6);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+            vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq01,rinv01);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq01);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+            
+            fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq02,rinv02);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq02);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+            
+            fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq10,rinv10);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq10);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq11,rinv11);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq11);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq12,rinv12);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq12);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq20,rinv20);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq20);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq21,rinv21);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq21);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq22,rinv22);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq22);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            gmx_fjsp_decrement_3rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 314 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 20 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*314);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecCoul_VdwCSTab_GeomW3W3_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: Coulomb
+ * VdW interaction:            CubicSplineTable
+ * Geometry:                   Water3-Water3
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecCoul_VdwCSTab_GeomW3W3_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. the two different
+     * jnr indices whose data is put in the two positions of the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B;
+    _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B;
+    _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+    _fjsp_v2r8       dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+    real             *vftab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    vftab            = kernel_data->table_vdw->data;
+    vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_vdw->scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq0              = gmx_fjsp_set1_v2r8(charge[inr+0]);
+    jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+    jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    qq00             = _fjsp_mul_v2r8(iq0,jq0);
+    c6_00            = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    qq01             = _fjsp_mul_v2r8(iq0,jq1);
+    qq02             = _fjsp_mul_v2r8(iq0,jq2);
+    qq10             = _fjsp_mul_v2r8(iq1,jq0);
+    qq11             = _fjsp_mul_v2r8(iq1,jq1);
+    qq12             = _fjsp_mul_v2r8(iq1,jq2);
+    qq20             = _fjsp_mul_v2r8(iq2,jq0);
+    qq21             = _fjsp_mul_v2r8(iq2,jq1);
+    qq22             = _fjsp_mul_v2r8(iq2,jq2);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx01             = _fjsp_sub_v2r8(ix0,jx1);
+            dy01             = _fjsp_sub_v2r8(iy0,jy1);
+            dz01             = _fjsp_sub_v2r8(iz0,jz1);
+            dx02             = _fjsp_sub_v2r8(ix0,jx2);
+            dy02             = _fjsp_sub_v2r8(iy0,jy2);
+            dz02             = _fjsp_sub_v2r8(iz0,jz2);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+            rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+            rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+            rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 8;
+            vfconv.i[1]     *= 8;
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq00,rinv00);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq00);
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 2 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 4 );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 6 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq01,rinv01);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq01);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+            
+            fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq02,rinv02);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq02);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+            
+            fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq10,rinv10);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq10);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq11,rinv11);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq11);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq12,rinv12);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq12);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq20,rinv20);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq20);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq21,rinv21);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq21);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq22,rinv22);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq22);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            gmx_fjsp_decrement_3rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 297 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx01             = _fjsp_sub_v2r8(ix0,jx1);
+            dy01             = _fjsp_sub_v2r8(iy0,jy1);
+            dz01             = _fjsp_sub_v2r8(iz0,jz1);
+            dx02             = _fjsp_sub_v2r8(ix0,jx2);
+            dy02             = _fjsp_sub_v2r8(iy0,jy2);
+            dz02             = _fjsp_sub_v2r8(iz0,jz2);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+            rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+            rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+            rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 8;
+            vfconv.i[1]     *= 8;
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq00,rinv00);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq00);
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq01,rinv01);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq01);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+            
+            fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq02,rinv02);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq02);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+            
+            fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq10,rinv10);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq10);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq11,rinv11);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq11);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq12,rinv12);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq12);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq20,rinv20);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq20);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq21,rinv21);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq21);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq22,rinv22);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq22);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            gmx_fjsp_decrement_3rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 297 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 18 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*297);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_sparc64_hpc_ace_double.c b/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..d299daf
--- /dev/null
@@ -0,0 +1,1097 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: Coulomb (computed for i sites 1-3 only)
+ * VdW interaction:            CubicSplineTable (computed for i site 0 only)
+ * Geometry:                   Water4-Particle (four-site i entries against single j particles)
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,       /* neighbor list: nri, iinr, jindex, jjnr, shift, gid are read */
+                     rvec * gmx_restrict                    xx,          /* coordinates, accessed below as a flat real array */
+                     rvec * gmx_restrict                    ff,          /* forces, handed to the force-update helpers */
+                     t_forcerec * gmx_restrict              fr,          /* shift_vec, fshift, epsfac, ntype, nbfp are used */
+                     t_mdatoms * gmx_restrict               mdatoms,     /* chargeA and typeA are read */
+                     nb_kernel_data_t * gmx_restrict        kernel_data, /* table_vdw, energygrp_elec, energygrp_vdw are used */
+                     t_nrnb * gmx_restrict                  nrnb)        /* flop counter passed to inc_nrnb() */
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     * Several locals declared below (e.g. rcutoff*, crf/krf/krf2, gbconv, ewconv) are never referenced here.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+    real             *vftab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv; /* vfconv: integer view of the two table indices */
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac); /* folded into iq1..iq3 below */
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    vftab            = kernel_data->table_vdw->data;
+    vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_vdw->scale); /* multiplies r to give the table coordinate */
+
+    /* Setup water-specific parameters once, from the first i entry; they are reused for every i entry below */
+    inr              = nlist->iinr[0];
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0]; /* site 0 offset into vdwparam; factor 2 presumably for (c6,c12) pairs */
+
+    /* Avoid compiler warnings (these values are overwritten before they are used) */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list (offset used for both shiftvec and fshift) */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_4rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+        fix3             = _fjsp_setzero_v2r8();
+        fiy3             = _fjsp_setzero_v2r8();
+        fiz3             = _fjsp_setzero_v2r8();
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+        vvdwsum          = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop: two j particles (A,B) per iteration; an odd leftover is handled after the loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx30             = _fjsp_sub_v2r8(ix3,jx0);
+            dy30             = _fjsp_sub_v2r8(iy3,jy0);
+            dz30             = _fjsp_sub_v2r8(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10); /* rinvsq is only formed for the Coulomb sites 1-3 */
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0]; /* added to vdwioffset0 to address the pair parameters */
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00); /* r = rsq * rinv */
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp)); /* fractional part of rt */
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 8; /* 8 reals per table point: offsets 0..3 dispersion, 4..7 repulsion */
+            vfconv.i[1]     *= 8;
+
+            /* CUBIC SPLINE TABLE DISPERSION (table offsets 0..3, read at indices i[0] and i[1]) */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 2 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F); /* Fp = F + eps*(G + eps*H), reading madd(a,b,c) as a*b+c */
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y); /* VV = Y + eps*Fp */
+            vvdw6            = _fjsp_mul_v2r8(c6_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp); /* FF = Fp + eps*(G + 2*eps*H) */
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION (table offsets 4..7); the summed FF terms are then scaled by -tabscale*rinv */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 4 );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 6 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            vvdw             = _fjsp_add_v2r8(vvdw12,vvdw6);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            /* Update vectorial force: the same dx*fscal term is accumulated for the i site and for j */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* COULOMB ELECTROSTATICS: velec = qq*rinv, felec = velec*rinvsq */
+            velec            = _fjsp_mul_v2r8(qq10,rinv10);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq10);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* COULOMB ELECTROSTATICS: velec = qq*rinv, felec = velec*rinvsq */
+            velec            = _fjsp_mul_v2r8(qq20,rinv20);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq20);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _fjsp_mul_v2r8(iq3,jq0);
+
+            /* COULOMB ELECTROSTATICS: velec = qq*rinv, felec = velec*rinvsq */
+            velec            = _fjsp_mul_v2r8(qq30,rinv30);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq30);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+            
+            fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+            /* Hand the accumulated j force to the helper for both j particles (its name suggests it is subtracted from f) */
+            gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 155 flops */
+        }
+        /* Odd number of j particles: the last one is processed alone, using only jnrA and table index i[0] */
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx30             = _fjsp_sub_v2r8(ix3,jx0);
+            dy30             = _fjsp_sub_v2r8(iy3,jy0);
+            dz30             = _fjsp_sub_v2r8(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10); /* rinvsq is only formed for the Coulomb sites 1-3 */
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0); /* one charge; presumably the other element stays zero */
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00); /* r = rsq * rinv */
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp)); /* fractional part of rt */
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 8; /* 8 reals per table point: offsets 0..3 dispersion, 4..7 repulsion */
+            vfconv.i[1]     *= 8; /* scaled like i[0], but i[1] is not used for any load in this branch */
+
+            /* CUBIC SPLINE TABLE DISPERSION (table offsets 0..3, read at index i[0] only) */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8(); /* no second j particle: zeros replace the i[1] table load */
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F); /* Fp = F + eps*(G + eps*H), reading madd(a,b,c) as a*b+c */
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y); /* VV = Y + eps*Fp */
+            vvdw6            = _fjsp_mul_v2r8(c6_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp); /* FF = Fp + eps*(G + 2*eps*H) */
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION (table offsets 4..7); the summed FF terms are then scaled by -tabscale*rinv */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            vvdw             = _fjsp_add_v2r8(vvdw12,vvdw6);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8()); /* pair the low element with zero; presumably clears the unused second element */
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8()); /* same masking applied to the force scalar */
+
+            /* Update vectorial force: the same dx*fscal term is accumulated for the i site and for j */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* COULOMB ELECTROSTATICS: velec = qq*rinv, felec = velec*rinvsq */
+            velec            = _fjsp_mul_v2r8(qq10,rinv10);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq10);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* COULOMB ELECTROSTATICS: velec = qq*rinv, felec = velec*rinvsq */
+            velec            = _fjsp_mul_v2r8(qq20,rinv20);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq20);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _fjsp_mul_v2r8(iq3,jq0);
+
+            /* COULOMB ELECTROSTATICS: velec = qq*rinv, felec = velec*rinvsq */
+            velec            = _fjsp_mul_v2r8(qq30,rinv30);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq30);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+            
+            fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+            /* Hand the accumulated j force to the single-pointer helper (its name suggests it is subtracted from f) */
+            gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 155 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_4atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx]; /* index into the energy-group arrays for this i entry */
+        /* Update potential energies */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations (counts j particles, not two-wide SIMD iterations) */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 26 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_VF,outeriter*26 + inneriter*155);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: Coulomb (plain qq/r; computed for i atoms 1-3 against the j particle)
+ * VdW interaction:            CubicSplineTable (i atom 0 only; tabulated dispersion + repulsion)
+ * Geometry:                   Water4-Particle (four-site i water, single-site j particles)
+ * Calculate force/pot:        Force (updates f and fshift only; no energies are accumulated)
+ */
+void
+nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters. NOTE(review): many locals declared below are never used in this variant.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. the two different
+     * jnr indices whose data is put in the two positions of the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+    real             *vftab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    vftab            = kernel_data->table_vdw->data;
+    vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_vdw->scale);
+
+    /* Setup water-specific parameters: read once from the first i water (iinr[0]) and reused for all i */
+    inr              = nlist->iinr[0];
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* Zero-initialize the j indices/offsets up front to avoid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_4rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+        fix3             = _fjsp_setzero_v2r8();
+        fiy3             = _fjsp_setzero_v2r8();
+        fiz3             = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop: two j particles (A and B) are processed per iteration */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx30             = _fjsp_sub_v2r8(ix3,jx0);
+            dy30             = _fjsp_sub_v2r8(iy3,jy0);
+            dz30             = _fjsp_sub_v2r8(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer; the index is then scaled by 8 reals per table point (loads at +0,+2 for dispersion and +4,+6 for repulsion) */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 8;
+            vfconv.i[1]     *= 8;
+
+            /* CUBIC SPLINE TABLE DISPERSION (force only: Y is loaded but not used afterwards) */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 2 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION (force only: Y is loaded but not used afterwards) */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 4 );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 6 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            fscal            = fvdw;
+
+            /* Update vectorial force: accumulate on the i atom and on the j sum passed to the decrement helper below */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq10,rinv10);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq10);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq20,rinv20);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq20);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _fjsp_mul_v2r8(iq3,jq0);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq30,rinv30);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq30);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+            
+            fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+
+            gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 144 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* Epilogue for a single leftover j particle: load j atom coordinates (index A only) */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx30             = _fjsp_sub_v2r8(ix3,jx0);
+            dy30             = _fjsp_sub_v2r8(iy3,jy0);
+            dz30             = _fjsp_sub_v2r8(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+
+            /* Load parameters for the single j particle (charge is merged into a zero vector) */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer; only index i[0] is used for table loads in this epilogue */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 8;
+            vfconv.i[1]     *= 8;
+
+            /* CUBIC SPLINE TABLE DISPERSION (table row for A only; the second transpose operand is zero) */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION (table row for A only; the second transpose operand is zero) */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            fscal            = fvdw;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force (fscal was combined with a zero vector above, presumably clearing the unused position - TODO confirm) */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq10,rinv10);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq10);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq20,rinv20);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq20);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _fjsp_mul_v2r8(iq3,jq0);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq30,rinv30);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq30);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+            
+            fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+
+            gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 144 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_4atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations (counted per j particle, not per SIMD iteration) */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 24 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops: 24 per outer iteration plus 144 per j particle */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_F,outeriter*24 + inneriter*144);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCoul_VdwCSTab_GeomW4W4_sparc64_hpc_ace_double.c b/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCoul_VdwCSTab_GeomW4W4_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..54f9d59
--- /dev/null
@@ -0,0 +1,1791 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecCoul_VdwCSTab_GeomW4W4_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: Coulomb
+ * VdW interaction:            CubicSplineTable
+ * Geometry:                   Water4-Water4
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecCoul_VdwCSTab_GeomW4W4_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B;
+    _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B;
+    _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    int              vdwjidx3A,vdwjidx3B;
+    _fjsp_v2r8       jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    _fjsp_v2r8       dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+    _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    _fjsp_v2r8       dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+    _fjsp_v2r8       dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+    _fjsp_v2r8       dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+    _fjsp_v2r8       dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+    real             *vftab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    vftab            = kernel_data->table_vdw->data;
+    vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_vdw->scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+    jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+    jq3              = gmx_fjsp_set1_v2r8(charge[inr+3]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    c6_00            = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    qq11             = _fjsp_mul_v2r8(iq1,jq1);
+    qq12             = _fjsp_mul_v2r8(iq1,jq2);
+    qq13             = _fjsp_mul_v2r8(iq1,jq3);
+    qq21             = _fjsp_mul_v2r8(iq2,jq1);
+    qq22             = _fjsp_mul_v2r8(iq2,jq2);
+    qq23             = _fjsp_mul_v2r8(iq2,jq3);
+    qq31             = _fjsp_mul_v2r8(iq3,jq1);
+    qq32             = _fjsp_mul_v2r8(iq3,jq2);
+    qq33             = _fjsp_mul_v2r8(iq3,jq3);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_4rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+        fix3             = _fjsp_setzero_v2r8();
+        fiy3             = _fjsp_setzero_v2r8();
+        fiz3             = _fjsp_setzero_v2r8();
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+        vvdwsum          = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_4rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx13             = _fjsp_sub_v2r8(ix1,jx3);
+            dy13             = _fjsp_sub_v2r8(iy1,jy3);
+            dz13             = _fjsp_sub_v2r8(iz1,jz3);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+            dx23             = _fjsp_sub_v2r8(ix2,jx3);
+            dy23             = _fjsp_sub_v2r8(iy2,jy3);
+            dz23             = _fjsp_sub_v2r8(iz2,jz3);
+            dx31             = _fjsp_sub_v2r8(ix3,jx1);
+            dy31             = _fjsp_sub_v2r8(iy3,jy1);
+            dz31             = _fjsp_sub_v2r8(iz3,jz1);
+            dx32             = _fjsp_sub_v2r8(ix3,jx2);
+            dy32             = _fjsp_sub_v2r8(iy3,jy2);
+            dz32             = _fjsp_sub_v2r8(iz3,jz2);
+            dx33             = _fjsp_sub_v2r8(ix3,jx3);
+            dy33             = _fjsp_sub_v2r8(iy3,jy3);
+            dz33             = _fjsp_sub_v2r8(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+            rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+            rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+            rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+            rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+            rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+            rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+            rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+            rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+            rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+            rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+            rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+            rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+            fjx3             = _fjsp_setzero_v2r8();
+            fjy3             = _fjsp_setzero_v2r8();
+            fjz3             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 8;
+            vfconv.i[1]     *= 8;
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 2 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 4 );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 6 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            vvdw             = _fjsp_add_v2r8(vvdw12,vvdw6);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq11,rinv11);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq11);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq12,rinv12);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq12);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq13,rinv13);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq13);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+            
+            fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq21,rinv21);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq21);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq22,rinv22);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq22);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq23,rinv23);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq23);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+            
+            fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq31,rinv31);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq31);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+            
+            fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq32,rinv32);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq32);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+            
+            fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq33,rinv33);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq33);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+            
+            fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+
+            gmx_fjsp_decrement_4rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 341 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_4rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx13             = _fjsp_sub_v2r8(ix1,jx3);
+            dy13             = _fjsp_sub_v2r8(iy1,jy3);
+            dz13             = _fjsp_sub_v2r8(iz1,jz3);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+            dx23             = _fjsp_sub_v2r8(ix2,jx3);
+            dy23             = _fjsp_sub_v2r8(iy2,jy3);
+            dz23             = _fjsp_sub_v2r8(iz2,jz3);
+            dx31             = _fjsp_sub_v2r8(ix3,jx1);
+            dy31             = _fjsp_sub_v2r8(iy3,jy1);
+            dz31             = _fjsp_sub_v2r8(iz3,jz1);
+            dx32             = _fjsp_sub_v2r8(ix3,jx2);
+            dy32             = _fjsp_sub_v2r8(iy3,jy2);
+            dz32             = _fjsp_sub_v2r8(iz3,jz2);
+            dx33             = _fjsp_sub_v2r8(ix3,jx3);
+            dy33             = _fjsp_sub_v2r8(iy3,jy3);
+            dz33             = _fjsp_sub_v2r8(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+            rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+            rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+            rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+            rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+            rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+            rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+            rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+            rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+            rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+            rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+            rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+            rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+            fjx3             = _fjsp_setzero_v2r8();
+            fjy3             = _fjsp_setzero_v2r8();
+            fjz3             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 8;
+            vfconv.i[1]     *= 8;
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            vvdw             = _fjsp_add_v2r8(vvdw12,vvdw6);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq11,rinv11);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq11);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq12,rinv12);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq12);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq13,rinv13);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq13);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+            
+            fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq21,rinv21);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq21);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq22,rinv22);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq22);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq23,rinv23);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq23);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+            
+            fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq31,rinv31);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq31);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+            
+            fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq32,rinv32);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq32);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+            
+            fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq33,rinv33);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq33);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+            
+            fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+
+            gmx_fjsp_decrement_4rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 341 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_4atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 26 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*341);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecCoul_VdwCSTab_GeomW4W4_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: Coulomb
+ * VdW interaction:            CubicSplineTable
+ * Geometry:                   Water4-Water4
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecCoul_VdwCSTab_GeomW4W4_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B;
+    _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B;
+    _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    int              vdwjidx3A,vdwjidx3B;
+    _fjsp_v2r8       jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    _fjsp_v2r8       dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+    _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    _fjsp_v2r8       dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+    _fjsp_v2r8       dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+    _fjsp_v2r8       dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+    _fjsp_v2r8       dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+    real             *vftab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    vftab            = kernel_data->table_vdw->data;
+    vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_vdw->scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+    jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+    jq3              = gmx_fjsp_set1_v2r8(charge[inr+3]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    c6_00            = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    qq11             = _fjsp_mul_v2r8(iq1,jq1);
+    qq12             = _fjsp_mul_v2r8(iq1,jq2);
+    qq13             = _fjsp_mul_v2r8(iq1,jq3);
+    qq21             = _fjsp_mul_v2r8(iq2,jq1);
+    qq22             = _fjsp_mul_v2r8(iq2,jq2);
+    qq23             = _fjsp_mul_v2r8(iq2,jq3);
+    qq31             = _fjsp_mul_v2r8(iq3,jq1);
+    qq32             = _fjsp_mul_v2r8(iq3,jq2);
+    qq33             = _fjsp_mul_v2r8(iq3,jq3);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_4rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+        fix3             = _fjsp_setzero_v2r8();
+        fiy3             = _fjsp_setzero_v2r8();
+        fiz3             = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_4rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx13             = _fjsp_sub_v2r8(ix1,jx3);
+            dy13             = _fjsp_sub_v2r8(iy1,jy3);
+            dz13             = _fjsp_sub_v2r8(iz1,jz3);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+            dx23             = _fjsp_sub_v2r8(ix2,jx3);
+            dy23             = _fjsp_sub_v2r8(iy2,jy3);
+            dz23             = _fjsp_sub_v2r8(iz2,jz3);
+            dx31             = _fjsp_sub_v2r8(ix3,jx1);
+            dy31             = _fjsp_sub_v2r8(iy3,jy1);
+            dz31             = _fjsp_sub_v2r8(iz3,jz1);
+            dx32             = _fjsp_sub_v2r8(ix3,jx2);
+            dy32             = _fjsp_sub_v2r8(iy3,jy2);
+            dz32             = _fjsp_sub_v2r8(iz3,jz2);
+            dx33             = _fjsp_sub_v2r8(ix3,jx3);
+            dy33             = _fjsp_sub_v2r8(iy3,jy3);
+            dz33             = _fjsp_sub_v2r8(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+            rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+            rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+            rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+            rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+            rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+            rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+            rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+            rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+            rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+            rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+            rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+            rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+            fjx3             = _fjsp_setzero_v2r8();
+            fjy3             = _fjsp_setzero_v2r8();
+            fjz3             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 8;
+            vfconv.i[1]     *= 8;
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 2 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 4 );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 6 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            fscal            = fvdw;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq11,rinv11);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq11);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq12,rinv12);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq12);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq13,rinv13);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq13);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+            
+            fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq21,rinv21);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq21);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq22,rinv22);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq22);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq23,rinv23);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq23);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+            
+            fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq31,rinv31);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq31);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+            
+            fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq32,rinv32);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq32);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+            
+            fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq33,rinv33);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq33);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+            
+            fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+
+            gmx_fjsp_decrement_4rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 324 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_4rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx13             = _fjsp_sub_v2r8(ix1,jx3);
+            dy13             = _fjsp_sub_v2r8(iy1,jy3);
+            dz13             = _fjsp_sub_v2r8(iz1,jz3);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+            dx23             = _fjsp_sub_v2r8(ix2,jx3);
+            dy23             = _fjsp_sub_v2r8(iy2,jy3);
+            dz23             = _fjsp_sub_v2r8(iz2,jz3);
+            dx31             = _fjsp_sub_v2r8(ix3,jx1);
+            dy31             = _fjsp_sub_v2r8(iy3,jy1);
+            dz31             = _fjsp_sub_v2r8(iz3,jz1);
+            dx32             = _fjsp_sub_v2r8(ix3,jx2);
+            dy32             = _fjsp_sub_v2r8(iy3,jy2);
+            dz32             = _fjsp_sub_v2r8(iz3,jz2);
+            dx33             = _fjsp_sub_v2r8(ix3,jx3);
+            dy33             = _fjsp_sub_v2r8(iy3,jy3);
+            dz33             = _fjsp_sub_v2r8(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+            rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+            rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+            rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+            rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+            rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+            rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+            rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+            rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+            rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+            rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+            rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+            rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+            fjx3             = _fjsp_setzero_v2r8();
+            fjy3             = _fjsp_setzero_v2r8();
+            fjz3             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 8;
+            vfconv.i[1]     *= 8;
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            fscal            = fvdw;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq11,rinv11);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq11);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq12,rinv12);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq12);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq13,rinv13);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq13);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+            
+            fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq21,rinv21);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq21);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq22,rinv22);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq22);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq23,rinv23);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq23);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+            
+            fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq31,rinv31);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq31);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+            
+            fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq32,rinv32);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq32);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+            
+            fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq33,rinv33);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq33);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+            
+            fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+
+            gmx_fjsp_decrement_4rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 324 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_4atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 24 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*324);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCoul_VdwLJ_GeomP1P1_sparc64_hpc_ace_double.c b/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCoul_VdwLJ_GeomP1P1_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..c5950bc
--- /dev/null
@@ -0,0 +1,545 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecCoul_VdwLJ_GeomP1P1_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: Coulomb (velec = qq*rinv; no cutoff or reaction-field terms are applied)
+ * VdW interaction:            LennardJones (c6/c12 pairs are read from fr->nbfp per atom-type pair)
+ * Geometry:                   Particle-Particle (single i atom against single j atoms)
+ * Calculate force/pot:        PotentialAndForce (accumulates forces and per-energy-group potentials)
+ */
+void
+nb_kernel_ElecCoul_VdwLJ_GeomP1P1_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist, /* in: neighbor list (nri, iinr, jindex, jjnr, shift, gid) */
+                     rvec * gmx_restrict                    xx, /* in: coordinates, addressed as a flat real array via xx[0] */
+                     rvec * gmx_restrict                    ff, /* in/out: forces, flat real array via ff[0], passed to the force helpers */
+                     t_forcerec * gmx_restrict              fr, /* in: shift_vec, epsfac, ntype, nbfp; fshift is passed to the i-force helper */
+                     t_mdatoms * gmx_restrict               mdatoms, /* in: per-atom chargeA and typeA arrays */
+                     nb_kernel_data_t * gmx_restrict        kernel_data, /* in/out: energygrp_elec / energygrp_vdw accumulators */
+                     t_nrnb * gmx_restrict                  nrnb) /* in/out: flop accounting, updated through inc_nrnb */
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters; this particle-particle kernel only uses suffix 0.
+     * Suffixes A,B refer to the two j neighbours (jnrA,jnrB) handled per iteration of the unrolled j loop.
+     * NOTE(review): several locals below (e.g. tx, rcutoff, krf, one, two, vfconv) are never used here.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    /* Initialise the j indices and offsets up front to silence compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Outer loop: one iteration per neighborlist entry (nri entries, one i particle each) */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Offset (in reals) of this list's shift vector; used for both shiftvec and fshift */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* The j neighbors of this entry are jjnr[j_index_start] .. jjnr[j_index_end-1] */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer atom index and its offset (in reals) into the coordinate/force arrays */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector (done inside the helper) */
+        gmx_fjsp_load_shift_and_1rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+
+        /* i charge pre-multiplied by facel (fr->epsfac); row offset of the i atom type into vdwparam */
+        iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_load1_v2r8(charge+inr+0));
+        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+        vvdwsum          = _fjsp_setzero_v2r8();
+
+        /* Main inner loop: two j neighbors per iteration; an odd leftover is handled after the loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load coordinates of both j atoms */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector (i minus j) */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+
+            /* Load j charges and the column offsets (2 reals per type) of the j atom types */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Charge product, and the c6/c12 pair for each of the two i-j type combinations */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq00,rinv00);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with these j atoms. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            /* Update vectorial force on the i atom; the helper below applies it to both j atoms */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fscal,dx00,dy00,dz00);
+
+            /* Inner loop uses 43 flops */
+        }
+
+        if(jidx<j_index_end) /* odd neighbor count: exactly one j atom is left over */
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load the single remaining j atom's coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector (i minus j) */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+
+            /* Load the one j charge into a zeroed register, and the j atom type's column offset */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Charge product, and the single c6/c12 pair for this i-j type combination */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq00,rinv00);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            /* Update potential sums; NOTE(review): unpacklo with zero presumably clears the unused second element - confirm */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+            vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force on the i atom; the helper below applies it to the one j atom */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fscal,dx00,dy00,dz00);
+
+            /* Inner loop uses 43 flops */
+        }
+
+        /* End of innermost loop: pass the accumulated i force to the helper with its f and fshift targets */
+
+        gmx_fjsp_update_iforce_1atom_swizzle_v2r8(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies in the slot selected by this list's energy group index ggid */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations (counted per j atom, not per unrolled loop trip) */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 9 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops: 9 per outer iteration plus 43 per j atom */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_VF,outeriter*9 + inneriter*43);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecCoul_VdwLJ_GeomP1P1_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: Coulomb
+ * VdW interaction:            LennardJones
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecCoul_VdwLJ_GeomP1P1_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_1rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+
+        /* Load parameters for i particles */
+        iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_load1_v2r8(charge+inr+0));
+        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq00,rinv00);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,c6_00),_fjsp_mul_v2r8(rinvsix,rinvsq00));
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fscal,dx00,dy00,dz00);
+
+            /* Inner loop uses 37 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq00,rinv00);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,c6_00),_fjsp_mul_v2r8(rinvsix,rinvsq00));
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fscal,dx00,dy00,dz00);
+
+            /* Inner loop uses 37 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_1atom_swizzle_v2r8(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 7 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_F,outeriter*7 + inneriter*37);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCoul_VdwLJ_GeomW3P1_sparc64_hpc_ace_double.c b/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCoul_VdwLJ_GeomW3P1_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..83f4c9b
--- /dev/null
@@ -0,0 +1,855 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecCoul_VdwLJ_GeomW3P1_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: Coulomb (velec = qq*rinv, evaluated for i sites 0, 1 and 2)
+ * VdW interaction:            LennardJones (evaluated only between i site 0 and the j particle)
+ * Geometry:                   Water3-Particle (three i sites per list entry, one site per j entry)
+ * Calculate force/pot:        PotentialAndForce (energies go to kernel_data->energygrp_elec/vdw)
+ */
+void
+nb_kernel_ElecCoul_VdwLJ_GeomW3P1_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist, /* neighbor list: nri, iinr, jindex, jjnr, shift, gid are read */
+                     rvec * gmx_restrict                    xx, /* coordinates, accessed below as the flat real array x = xx[0] */
+                     rvec * gmx_restrict                    ff, /* forces, accessed below as the flat real array f = ff[0] */
+                     t_forcerec * gmx_restrict              fr, /* supplies shift_vec, fshift, epsfac, ntype, nbfp */
+                     t_mdatoms * gmx_restrict               mdatoms, /* supplies chargeA and typeA */
+                     nb_kernel_data_t * gmx_restrict        kernel_data, /* supplies energygrp_elec and energygrp_vdw */
+                     t_nrnb * gmx_restrict                  nrnb) /* flop accounting, handed to inc_nrnb() at the end */
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar; /* declared by the generator; not referenced in this kernel */
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; /* only fscal is referenced in this kernel */
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       itab_tmp; /* not referenced in this kernel */
+    _fjsp_v2r8       dummy_mask,cutoff_mask; /* not referenced in this kernel */
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0); /* initialized but not referenced in this kernel */
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0); /* initialized but not referenced in this kernel */
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv; /* not referenced in this kernel */
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    /* Setup water-specific parameters: charges and the VdW offset are taken once from the first i entry and reused for every list entry */
+    inr              = nlist->iinr[0]; /* NOTE(review): read before the nri loop; assumes the list is never empty - confirm against caller */
+    iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0]; /* offset into vdwparam for the type of i site 0; the j type offset is added per pair */
+
+    /* Give the j indices a defined value up front to avoid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+        vvdwsum          = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop: two j entries (A and B) are processed per iteration */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq00,rinv00);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) ); /* presumably vvdw12/12 - vvdw6/6, if msub(a,b,c) is a*b-c - TODO confirm */
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq10,rinv10);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq10);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq20,rinv20);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq20);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 108 flops */
+        }
+
+        if(jidx<j_index_end) /* one j entry is left over when the list length is odd */
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+
+            /* Load parameters for the single remaining j particle */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0); /* presumably fills one element from memory and leaves the other zero - TODO confirm */
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq00,rinv00);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8()); /* presumably clears the unused SIMD element before summing - TODO confirm */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+            vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq10,rinv10);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq10);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq20,rinv20);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq20);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 108 flops */
+        }
+
+        /* End of innermost loop: hand the accumulated i-site forces to the helper with the f and fshift destinations */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies in the slot selected by this list entry's gid value */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 20 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops: 20 per outer iteration plus 108 per j entry */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_VF,outeriter*20 + inneriter*108);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecCoul_VdwLJ_GeomW3P1_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: Coulomb
+ * VdW interaction:            LennardJones
+ * Geometry:                   Water3-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecCoul_VdwLJ_GeomW3P1_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq00,rinv00);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,c6_00),_fjsp_mul_v2r8(rinvsix,rinvsq00));
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq10,rinv10);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq10);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq20,rinv20);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq20);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 100 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq00,rinv00);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,c6_00),_fjsp_mul_v2r8(rinvsix,rinvsq00));
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq10,rinv10);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq10);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq20,rinv20);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq20);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 100 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 18 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_F,outeriter*18 + inneriter*100);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCoul_VdwLJ_GeomW3W3_sparc64_hpc_ace_double.c b/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCoul_VdwLJ_GeomW3W3_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..56f53a1
--- /dev/null
@@ -0,0 +1,1537 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecCoul_VdwLJ_GeomW3W3_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: Coulomb
+ * VdW interaction:            LennardJones
+ * Geometry:                   Water3-Water3
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecCoul_VdwLJ_GeomW3W3_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B;
+    _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B;
+    _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+    _fjsp_v2r8       dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq0              = gmx_fjsp_set1_v2r8(charge[inr+0]);
+    jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+    jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    qq00             = _fjsp_mul_v2r8(iq0,jq0);
+    c6_00            = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    qq01             = _fjsp_mul_v2r8(iq0,jq1);
+    qq02             = _fjsp_mul_v2r8(iq0,jq2);
+    qq10             = _fjsp_mul_v2r8(iq1,jq0);
+    qq11             = _fjsp_mul_v2r8(iq1,jq1);
+    qq12             = _fjsp_mul_v2r8(iq1,jq2);
+    qq20             = _fjsp_mul_v2r8(iq2,jq0);
+    qq21             = _fjsp_mul_v2r8(iq2,jq1);
+    qq22             = _fjsp_mul_v2r8(iq2,jq2);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+        vvdwsum          = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx01             = _fjsp_sub_v2r8(ix0,jx1);
+            dy01             = _fjsp_sub_v2r8(iy0,jy1);
+            dz01             = _fjsp_sub_v2r8(iz0,jz1);
+            dx02             = _fjsp_sub_v2r8(ix0,jx2);
+            dy02             = _fjsp_sub_v2r8(iy0,jy2);
+            dz02             = _fjsp_sub_v2r8(iz0,jz2);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+            rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+            rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+            rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq00,rinv00);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq01,rinv01);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq01);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+            
+            fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq02,rinv02);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq02);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+            
+            fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq10,rinv10);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq10);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq11,rinv11);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq11);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq12,rinv12);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq12);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq20,rinv20);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq20);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq21,rinv21);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq21);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq22,rinv22);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq22);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            gmx_fjsp_decrement_3rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 291 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx01             = _fjsp_sub_v2r8(ix0,jx1);
+            dy01             = _fjsp_sub_v2r8(iy0,jy1);
+            dz01             = _fjsp_sub_v2r8(iz0,jz1);
+            dx02             = _fjsp_sub_v2r8(ix0,jx2);
+            dy02             = _fjsp_sub_v2r8(iy0,jy2);
+            dz02             = _fjsp_sub_v2r8(iz0,jz2);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+            rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+            rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+            rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq00,rinv00);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+            vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq01,rinv01);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq01);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+            
+            fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq02,rinv02);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq02);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+            
+            fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq10,rinv10);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq10);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq11,rinv11);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq11);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq12,rinv12);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq12);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq20,rinv20);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq20);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq21,rinv21);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq21);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq22,rinv22);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq22);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            gmx_fjsp_decrement_3rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 291 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 20 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*291);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecCoul_VdwLJ_GeomW3W3_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: Coulomb
+ * VdW interaction:            LennardJones
+ * Geometry:                   Water3-Water3
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecCoul_VdwLJ_GeomW3W3_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. the two different
+     * jnr indices whose data are put in the two positions of the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B;
+    _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B;
+    _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+    _fjsp_v2r8       dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq0              = gmx_fjsp_set1_v2r8(charge[inr+0]);
+    jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+    jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    qq00             = _fjsp_mul_v2r8(iq0,jq0);
+    c6_00            = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    qq01             = _fjsp_mul_v2r8(iq0,jq1);
+    qq02             = _fjsp_mul_v2r8(iq0,jq2);
+    qq10             = _fjsp_mul_v2r8(iq1,jq0);
+    qq11             = _fjsp_mul_v2r8(iq1,jq1);
+    qq12             = _fjsp_mul_v2r8(iq1,jq2);
+    qq20             = _fjsp_mul_v2r8(iq2,jq0);
+    qq21             = _fjsp_mul_v2r8(iq2,jq1);
+    qq22             = _fjsp_mul_v2r8(iq2,jq2);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx01             = _fjsp_sub_v2r8(ix0,jx1);
+            dy01             = _fjsp_sub_v2r8(iy0,jy1);
+            dz01             = _fjsp_sub_v2r8(iz0,jz1);
+            dx02             = _fjsp_sub_v2r8(ix0,jx2);
+            dy02             = _fjsp_sub_v2r8(iy0,jy2);
+            dz02             = _fjsp_sub_v2r8(iz0,jz2);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+            rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+            rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+            rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq00,rinv00);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,c6_00),_fjsp_mul_v2r8(rinvsix,rinvsq00));
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq01,rinv01);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq01);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+            
+            fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq02,rinv02);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq02);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+            
+            fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq10,rinv10);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq10);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq11,rinv11);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq11);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq12,rinv12);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq12);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq20,rinv20);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq20);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq21,rinv21);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq21);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq22,rinv22);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq22);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            gmx_fjsp_decrement_3rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 277 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx01             = _fjsp_sub_v2r8(ix0,jx1);
+            dy01             = _fjsp_sub_v2r8(iy0,jy1);
+            dz01             = _fjsp_sub_v2r8(iz0,jz1);
+            dx02             = _fjsp_sub_v2r8(ix0,jx2);
+            dy02             = _fjsp_sub_v2r8(iy0,jy2);
+            dz02             = _fjsp_sub_v2r8(iz0,jz2);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+            rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+            rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+            rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq00,rinv00);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,c6_00),_fjsp_mul_v2r8(rinvsix,rinvsq00));
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq01,rinv01);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq01);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+            
+            fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq02,rinv02);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq02);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+            
+            fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq10,rinv10);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq10);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq11,rinv11);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq11);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq12,rinv12);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq12);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq20,rinv20);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq20);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq21,rinv21);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq21);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq22,rinv22);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq22);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            gmx_fjsp_decrement_3rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 277 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 18 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*277);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCoul_VdwLJ_GeomW4P1_sparc64_hpc_ace_double.c b/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCoul_VdwLJ_GeomW4P1_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..a52b05d
--- /dev/null
@@ -0,0 +1,963 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecCoul_VdwLJ_GeomW4P1_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: Coulomb
+ * VdW interaction:            LennardJones
+ * Geometry:                   Water4-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecCoul_VdwLJ_GeomW4P1_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_4rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+        fix3             = _fjsp_setzero_v2r8();
+        fiy3             = _fjsp_setzero_v2r8();
+        fiz3             = _fjsp_setzero_v2r8();
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+        vvdwsum          = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx30             = _fjsp_sub_v2r8(ix3,jx0);
+            dy30             = _fjsp_sub_v2r8(iy3,jy0);
+            dz30             = _fjsp_sub_v2r8(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+
+            rinvsq00         = gmx_fjsp_inv_v2r8(rsq00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq10,rinv10);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq10);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq20,rinv20);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq20);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _fjsp_mul_v2r8(iq3,jq0);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq30,rinv30);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq30);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+            
+            fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+
+            gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 131 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx30             = _fjsp_sub_v2r8(ix3,jx0);
+            dy30             = _fjsp_sub_v2r8(iy3,jy0);
+            dz30             = _fjsp_sub_v2r8(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+
+            rinvsq00         = gmx_fjsp_inv_v2r8(rsq00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq10,rinv10);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq10);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq20,rinv20);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq20);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _fjsp_mul_v2r8(iq3,jq0);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq30,rinv30);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq30);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+            
+            fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+
+            gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 131 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_4atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 26 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_VF,outeriter*26 + inneriter*131);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecCoul_VdwLJ_GeomW4P1_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: Coulomb
+ * VdW interaction:            LennardJones
+ * Geometry:                   Water4-Particle
+ * Calculate force/pot:        Force
+ *
+ * Force-only kernel for a four-site i water against single j particles.
+ * Site 0 of the water interacts with each j atom through Lennard-Jones only
+ * (its parameters come from vdwtype[inr+0]); sites 1-3 interact through plain
+ * Coulomb only (their charges come from charge[inr+1..inr+3]). No potential
+ * energies are accumulated here.
+ *
+ * nlist        neighbor list; nri, iinr, jindex, jjnr, shift and gid are read
+ * xx           coordinates, accessed as a flat real array starting at xx[0]
+ * ff           forces, accessed as a flat real array starting at ff[0] and
+ *              handed to the gmx_fjsp_decrement_* / gmx_fjsp_update_iforce_*
+ *              helpers for updating
+ * fr           supplies shift_vec, fshift, epsfac, ntype and nbfp
+ * mdatoms      supplies chargeA and typeA
+ * kernel_data  not referenced by this force-only kernel
+ * nrnb         flop accounting, updated once through inc_nrnb() at the end
+ */
+void
+nb_kernel_ElecCoul_VdwLJ_GeomW4P1_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     *
+     * NOTE(review): many of the locals declared below (e.g. tx, rcutoff, krf, velecsum,
+     * vvdwsum, one, two, vfconv) are never referenced in this kernel; presumably they
+     * come from a declaration template shared with the other generated kernels.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    /* Flat real views of the coordinate and force rvec arrays */
+    x                = xx[0];
+    f                = ff[0];
+
+    /* Cache neighbor list arrays and force field parameters in locals.
+     * gid is fetched but not used further down: ggid is never assigned in this kernel.
+     */
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    /* Setup water-specific parameters.
+     * Charges and the LJ type are taken from the first i entry only and reused for
+     * every i entry in the list - presumably all waters in one list are identical;
+     * confirm against the neighbor list construction.
+     * NOTE(review): iinr[0] is read before the nri loop bound is tested - confirm
+     * that callers never pass an empty list.
+     * The i charges are pre-multiplied by epsfac so qq = iq*jq needs no extra factor.
+     */
+    inr              = nlist->iinr[0];
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+    /* Start of the parameter row for i site 0 in vdwparam; two reals per type pair */
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* Initialize to silence compiler warnings about possibly uninitialized variables */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_4rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        /* Clear the per-water force accumulators for the four i sites */
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+        fix3             = _fjsp_setzero_v2r8();
+        fiy3             = _fjsp_setzero_v2r8();
+        fiz3             = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop.
+         * Two j entries (A and B) are consumed per iteration; the bound j_index_end-1
+         * keeps jidx+1 valid, and an odd trailing entry is handled after the loop.
+         */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx30             = _fjsp_sub_v2r8(ix3,jx0);
+            dy30             = _fjsp_sub_v2r8(iy3,jy0);
+            dz30             = _fjsp_sub_v2r8(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+
+            /* Only the Coulomb pairs (10,20,30) go through invsqrt; the LJ pair (00)
+             * gets its inverse square directly from rsq00 and rinv00 is never computed.
+             */
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+
+            rinvsq00         = gmx_fjsp_inv_v2r8(rsq00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            /* The j force is accumulated over all four i sites and written back once */
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            /* rinvsix = r^-6. Assuming _fjsp_msub_v2r8(a,b,c) evaluates a*b-c (TODO confirm
+             * against the intrinsic documentation), fvdw = (c12*r^-6 - c6)*r^-8.
+             */
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,c6_00),_fjsp_mul_v2r8(rinvsix,rinvsq00));
+
+            fscal            = fvdw;
+
+            /* Update vectorial force.
+             * The same dx*fscal term is added to both the i and the j accumulator
+             * (assuming _fjsp_madd_v2r8(a,b,c) evaluates a*b+c - TODO confirm).
+             */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* COULOMB ELECTROSTATICS */
+            /* velec = qq/r is only an intermediate here; felec = velec/r^2 */
+            velec            = _fjsp_mul_v2r8(qq10,rinv10);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq10);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq20,rinv20);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq20);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _fjsp_mul_v2r8(iq3,jq0);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq30,rinv30);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq30);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+            
+            fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+
+            /* Hand the accumulated j force to the helper for both j atoms; by its name it
+             * subtracts the values from f - confirm in kernelutil_sparc64_hpc_ace_double.h.
+             */
+            gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 123 flops */
+        }
+
+        /* Remainder: one j entry is left when the list has an odd number of entries.
+         * Only atom A is loaded, and every fscal is passed through
+         * _fjsp_unpacklo_v2r8(fscal,zero) before use - presumably this keeps the low
+         * element and zeroes the high one so the unused second SIMD position
+         * contributes no force (TODO confirm against the intrinsic documentation).
+         */
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx30             = _fjsp_sub_v2r8(ix3,jx0);
+            dy30             = _fjsp_sub_v2r8(iy3,jy0);
+            dz30             = _fjsp_sub_v2r8(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+
+            rinvsq00         = gmx_fjsp_inv_v2r8(rsq00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            /* Single charge loaded on top of a zeroed register */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,c6_00),_fjsp_mul_v2r8(rinvsix,rinvsq00));
+
+            fscal            = fvdw;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq10,rinv10);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq10);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq20,rinv20);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq20);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _fjsp_mul_v2r8(iq3,jq0);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq30,rinv30);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq30);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+            
+            fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+
+            /* Single-pointer variant: only atom A receives the accumulated j force */
+            gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 123 flops */
+        }
+
+        /* End of innermost loop */
+
+        /* Pass the twelve i-site accumulators to the helper together with the force and
+         * shift-force destinations for this i entry.
+         */
+        gmx_fjsp_update_iforce_4atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        /* Counted per j entry, not per two-wide SIMD iteration */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 24 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    /* 24 flops per i entry plus 123 per j entry, matching the per-loop notes above */
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_F,outeriter*24 + inneriter*123);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCoul_VdwLJ_GeomW4W4_sparc64_hpc_ace_double.c b/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCoul_VdwLJ_GeomW4W4_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..44e3580
--- /dev/null
@@ -0,0 +1,1657 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecCoul_VdwLJ_GeomW4W4_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: Coulomb
+ * VdW interaction:            LennardJones
+ * Geometry:                   Water4-Water4
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecCoul_VdwLJ_GeomW4W4_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B;
+    _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B;
+    _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    int              vdwjidx3A,vdwjidx3B;
+    _fjsp_v2r8       jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    _fjsp_v2r8       dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+    _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    _fjsp_v2r8       dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+    _fjsp_v2r8       dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+    _fjsp_v2r8       dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+    _fjsp_v2r8       dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+    jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+    jq3              = gmx_fjsp_set1_v2r8(charge[inr+3]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    c6_00            = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    qq11             = _fjsp_mul_v2r8(iq1,jq1);
+    qq12             = _fjsp_mul_v2r8(iq1,jq2);
+    qq13             = _fjsp_mul_v2r8(iq1,jq3);
+    qq21             = _fjsp_mul_v2r8(iq2,jq1);
+    qq22             = _fjsp_mul_v2r8(iq2,jq2);
+    qq23             = _fjsp_mul_v2r8(iq2,jq3);
+    qq31             = _fjsp_mul_v2r8(iq3,jq1);
+    qq32             = _fjsp_mul_v2r8(iq3,jq2);
+    qq33             = _fjsp_mul_v2r8(iq3,jq3);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_4rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+        fix3             = _fjsp_setzero_v2r8();
+        fiy3             = _fjsp_setzero_v2r8();
+        fiz3             = _fjsp_setzero_v2r8();
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+        vvdwsum          = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_4rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx13             = _fjsp_sub_v2r8(ix1,jx3);
+            dy13             = _fjsp_sub_v2r8(iy1,jy3);
+            dz13             = _fjsp_sub_v2r8(iz1,jz3);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+            dx23             = _fjsp_sub_v2r8(ix2,jx3);
+            dy23             = _fjsp_sub_v2r8(iy2,jy3);
+            dz23             = _fjsp_sub_v2r8(iz2,jz3);
+            dx31             = _fjsp_sub_v2r8(ix3,jx1);
+            dy31             = _fjsp_sub_v2r8(iy3,jy1);
+            dz31             = _fjsp_sub_v2r8(iz3,jz1);
+            dx32             = _fjsp_sub_v2r8(ix3,jx2);
+            dy32             = _fjsp_sub_v2r8(iy3,jy2);
+            dz32             = _fjsp_sub_v2r8(iz3,jz2);
+            dx33             = _fjsp_sub_v2r8(ix3,jx3);
+            dy33             = _fjsp_sub_v2r8(iy3,jy3);
+            dz33             = _fjsp_sub_v2r8(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+            rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+            rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+            rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+            rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+            rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+            rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+            rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+            rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+
+            rinvsq00         = gmx_fjsp_inv_v2r8(rsq00);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+            rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+            rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+            rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+            rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+            fjx3             = _fjsp_setzero_v2r8();
+            fjy3             = _fjsp_setzero_v2r8();
+            fjz3             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq11,rinv11);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq11);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq12,rinv12);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq12);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq13,rinv13);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq13);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+            
+            fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq21,rinv21);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq21);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq22,rinv22);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq22);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq23,rinv23);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq23);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+            
+            fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq31,rinv31);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq31);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+            
+            fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq32,rinv32);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq32);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+            
+            fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq33,rinv33);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq33);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+            
+            fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+
+            gmx_fjsp_decrement_4rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 317 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_4rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx13             = _fjsp_sub_v2r8(ix1,jx3);
+            dy13             = _fjsp_sub_v2r8(iy1,jy3);
+            dz13             = _fjsp_sub_v2r8(iz1,jz3);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+            dx23             = _fjsp_sub_v2r8(ix2,jx3);
+            dy23             = _fjsp_sub_v2r8(iy2,jy3);
+            dz23             = _fjsp_sub_v2r8(iz2,jz3);
+            dx31             = _fjsp_sub_v2r8(ix3,jx1);
+            dy31             = _fjsp_sub_v2r8(iy3,jy1);
+            dz31             = _fjsp_sub_v2r8(iz3,jz1);
+            dx32             = _fjsp_sub_v2r8(ix3,jx2);
+            dy32             = _fjsp_sub_v2r8(iy3,jy2);
+            dz32             = _fjsp_sub_v2r8(iz3,jz2);
+            dx33             = _fjsp_sub_v2r8(ix3,jx3);
+            dy33             = _fjsp_sub_v2r8(iy3,jy3);
+            dz33             = _fjsp_sub_v2r8(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+            rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+            rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+            rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+            rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+            rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+            rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+            rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+            rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+
+            rinvsq00         = gmx_fjsp_inv_v2r8(rsq00);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+            rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+            rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+            rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+            rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+            fjx3             = _fjsp_setzero_v2r8();
+            fjy3             = _fjsp_setzero_v2r8();
+            fjz3             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq11,rinv11);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq11);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq12,rinv12);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq12);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq13,rinv13);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq13);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+            
+            fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq21,rinv21);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq21);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq22,rinv22);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq22);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq23,rinv23);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq23);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+            
+            fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq31,rinv31);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq31);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+            
+            fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq32,rinv32);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq32);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+            
+            fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq33,rinv33);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq33);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+            
+            fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+
+            gmx_fjsp_decrement_4rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 317 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_4atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 26 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*317);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecCoul_VdwLJ_GeomW4W4_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: Coulomb
+ * VdW interaction:            LennardJones
+ * Geometry:                   Water4-Water4
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecCoul_VdwLJ_GeomW4W4_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B;
+    _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B;
+    _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    int              vdwjidx3A,vdwjidx3B;
+    _fjsp_v2r8       jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    _fjsp_v2r8       dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+    _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    _fjsp_v2r8       dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+    _fjsp_v2r8       dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+    _fjsp_v2r8       dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+    _fjsp_v2r8       dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+    jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+    jq3              = gmx_fjsp_set1_v2r8(charge[inr+3]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    c6_00            = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    qq11             = _fjsp_mul_v2r8(iq1,jq1);
+    qq12             = _fjsp_mul_v2r8(iq1,jq2);
+    qq13             = _fjsp_mul_v2r8(iq1,jq3);
+    qq21             = _fjsp_mul_v2r8(iq2,jq1);
+    qq22             = _fjsp_mul_v2r8(iq2,jq2);
+    qq23             = _fjsp_mul_v2r8(iq2,jq3);
+    qq31             = _fjsp_mul_v2r8(iq3,jq1);
+    qq32             = _fjsp_mul_v2r8(iq3,jq2);
+    qq33             = _fjsp_mul_v2r8(iq3,jq3);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_4rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+        fix3             = _fjsp_setzero_v2r8();
+        fiy3             = _fjsp_setzero_v2r8();
+        fiz3             = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_4rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx13             = _fjsp_sub_v2r8(ix1,jx3);
+            dy13             = _fjsp_sub_v2r8(iy1,jy3);
+            dz13             = _fjsp_sub_v2r8(iz1,jz3);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+            dx23             = _fjsp_sub_v2r8(ix2,jx3);
+            dy23             = _fjsp_sub_v2r8(iy2,jy3);
+            dz23             = _fjsp_sub_v2r8(iz2,jz3);
+            dx31             = _fjsp_sub_v2r8(ix3,jx1);
+            dy31             = _fjsp_sub_v2r8(iy3,jy1);
+            dz31             = _fjsp_sub_v2r8(iz3,jz1);
+            dx32             = _fjsp_sub_v2r8(ix3,jx2);
+            dy32             = _fjsp_sub_v2r8(iy3,jy2);
+            dz32             = _fjsp_sub_v2r8(iz3,jz2);
+            dx33             = _fjsp_sub_v2r8(ix3,jx3);
+            dy33             = _fjsp_sub_v2r8(iy3,jy3);
+            dz33             = _fjsp_sub_v2r8(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+            rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+            rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+            rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+            rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+            rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+            rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+            rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+            rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+
+            rinvsq00         = gmx_fjsp_inv_v2r8(rsq00);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+            rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+            rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+            rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+            rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+            fjx3             = _fjsp_setzero_v2r8();
+            fjy3             = _fjsp_setzero_v2r8();
+            fjz3             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,c6_00),_fjsp_mul_v2r8(rinvsix,rinvsq00));
+
+            fscal            = fvdw;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq11,rinv11);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq11);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq12,rinv12);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq12);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq13,rinv13);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq13);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+            
+            fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq21,rinv21);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq21);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq22,rinv22);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq22);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq23,rinv23);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq23);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+            
+            fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq31,rinv31);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq31);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+            
+            fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq32,rinv32);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq32);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+            
+            fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq33,rinv33);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq33);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+            
+            fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+
+            gmx_fjsp_decrement_4rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 303 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_4rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx13             = _fjsp_sub_v2r8(ix1,jx3);
+            dy13             = _fjsp_sub_v2r8(iy1,jy3);
+            dz13             = _fjsp_sub_v2r8(iz1,jz3);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+            dx23             = _fjsp_sub_v2r8(ix2,jx3);
+            dy23             = _fjsp_sub_v2r8(iy2,jy3);
+            dz23             = _fjsp_sub_v2r8(iz2,jz3);
+            dx31             = _fjsp_sub_v2r8(ix3,jx1);
+            dy31             = _fjsp_sub_v2r8(iy3,jy1);
+            dz31             = _fjsp_sub_v2r8(iz3,jz1);
+            dx32             = _fjsp_sub_v2r8(ix3,jx2);
+            dy32             = _fjsp_sub_v2r8(iy3,jy2);
+            dz32             = _fjsp_sub_v2r8(iz3,jz2);
+            dx33             = _fjsp_sub_v2r8(ix3,jx3);
+            dy33             = _fjsp_sub_v2r8(iy3,jy3);
+            dz33             = _fjsp_sub_v2r8(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+            rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+            rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+            rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+            rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+            rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+            rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+            rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+            rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+
+            rinvsq00         = gmx_fjsp_inv_v2r8(rsq00);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+            rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+            rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+            rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+            rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+            fjx3             = _fjsp_setzero_v2r8();
+            fjy3             = _fjsp_setzero_v2r8();
+            fjz3             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,c6_00),_fjsp_mul_v2r8(rinvsix,rinvsq00));
+
+            fscal            = fvdw;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq11,rinv11);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq11);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq12,rinv12);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq12);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq13,rinv13);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq13);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+            
+            fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq21,rinv21);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq21);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq22,rinv22);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq22);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq23,rinv23);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq23);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+            
+            fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq31,rinv31);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq31);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+            
+            fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq32,rinv32);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq32);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+            
+            fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq33,rinv33);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq33);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+            
+            fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+
+            gmx_fjsp_decrement_4rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 303 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_4atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 24 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*303);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCoul_VdwNone_GeomP1P1_sparc64_hpc_ace_double.c b/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCoul_VdwNone_GeomP1P1_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..d3b90c8
--- /dev/null
@@ -0,0 +1,482 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecCoul_VdwNone_GeomP1P1_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: Coulomb
+ * VdW interaction:            None
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecCoul_VdwNone_GeomP1P1_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,        /* neighborlist: nri, iinr, jindex, jjnr, shift, gid are read */
+                     rvec * gmx_restrict                    xx,           /* coordinates; accessed below as a flat real array */
+                     rvec * gmx_restrict                    ff,           /* forces; handed to the force update helpers below */
+                     t_forcerec * gmx_restrict              fr,           /* supplies shift_vec, fshift and epsfac */
+                     t_mdatoms * gmx_restrict               mdatoms,      /* supplies chargeA */
+                     nb_kernel_data_t * gmx_restrict        kernel_data,  /* supplies energygrp_elec */
+                     t_nrnb * gmx_restrict                  nrnb)         /* flop counter passed to inc_nrnb() */
+{
+    /* Coulomb-only particle-particle kernel: for every i entry of the neighborlist it accumulates the
+     * pair forces (via f and fshift) and the electrostatic energy (via kernel_data->energygrp_elec).
+     * Suffix 0 is the only particle index used here (the generator uses 0,1,2,3 for water sites);
+     * suffixes A,B are the two jnr indices processed together in one double precision SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv; /* NOTE(review): many generator-declared locals are never referenced in this kernel (rcutoff*, tx/ty/tz, jidxall, vdw* indices, isai0/isaj0, fj*0, r00, c6/c12, crf/krf/krf2, itab_tmp, the masks, one/two, the *conv unions) */
+
+    x                = xx[0]; /* flat real pointer; indexed below with DIM*atom offsets */
+    f                = ff[0]; /* flat real pointer; indexed below with DIM*atom offsets */
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+
+    /* Initialize the j indices up front to silence compiler warnings (presumably about possibly uninitialized use) */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists: one iteration per i entry, nri in total */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Offset of this list's shift vector; used to index both shiftvec and fshift */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* The j neighbors of this i entry are jjnr[j_index_start] .. jjnr[j_index_end-1] */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_1rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+
+        /* Load parameters for i particles: the i charge is pre-multiplied by facel (fr->epsfac) */
+        iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_load1_v2r8(charge+inr+0));
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop: j neighbors are taken two at a time (A,B); an odd leftover is handled after the loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq00,rinv00);    /* potential: qq00*rinv00 */
+            felec            = _fjsp_mul_v2r8(velec,rinvsq00); /* scalar force factor: velec*rinvsq00 */
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fscal,dx00,dy00,dz00);
+
+            /* Inner loop uses 31 flops */
+        }
+
+        if(jidx<j_index_end) /* one unpaired j neighbor is left: same computation using only jnrA */
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+
+            /* Load parameters for j particles: only jnrA's charge is read, the zero vector is the other operand */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq00,rinv00);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8()); /* presumably zeroes the lane that has no real j atom - TODO confirm */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8()); /* same lane clearing as for velec above, applied to the force factor */
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fscal,dx00,dy00,dz00);
+
+            /* Inner loop uses 31 flops */
+        }
+
+        /* End of innermost loop: hand the accumulated i force to the helper together with the f and fshift destinations */
+
+        gmx_fjsp_update_iforce_1atom_swizzle_v2r8(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies: velecsum goes to the energygrp_elec entry selected by ggid */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 8 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops, using the per-iteration counts quoted in the loop comments above */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VF,outeriter*8 + inneriter*31);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecCoul_VdwNone_GeomP1P1_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: Coulomb
+ * VdW interaction:            None
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecCoul_VdwNone_GeomP1P1_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,        /* neighborlist: nri, iinr, jindex, jjnr, shift, gid are read */
+                     rvec * gmx_restrict                    xx,           /* coordinates; accessed below as a flat real array */
+                     rvec * gmx_restrict                    ff,           /* forces; handed to the force update helpers below */
+                     t_forcerec * gmx_restrict              fr,           /* supplies shift_vec, fshift and epsfac */
+                     t_mdatoms * gmx_restrict               mdatoms,      /* supplies chargeA */
+                     nb_kernel_data_t * gmx_restrict        kernel_data,  /* not referenced in this force-only kernel */
+                     t_nrnb * gmx_restrict                  nrnb)         /* flop counter passed to inc_nrnb() */
+{
+    /* Coulomb-only particle-particle kernel, force-only variant: for every i entry of the neighborlist
+     * it accumulates the pair forces (via f and fshift); no potential energy is summed or stored.
+     * Suffix 0 is the only particle index used here (the generator uses 0,1,2,3 for water sites);
+     * suffixes A,B are the two jnr indices processed together in one double precision SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv; /* NOTE(review): many generator-declared locals are never referenced in this kernel (ggid, velecsum, rcutoff*, tx/ty/tz, jidxall, vdw* indices, isai0/isaj0, fj*0, r00, c6/c12, crf/krf/krf2, itab_tmp, the masks, one/two, the *conv unions) */
+
+    x                = xx[0]; /* flat real pointer; indexed below with DIM*atom offsets */
+    f                = ff[0]; /* flat real pointer; indexed below with DIM*atom offsets */
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid; /* assigned but not read again in this force-only kernel */
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+
+    /* Initialize the j indices up front to silence compiler warnings (presumably about possibly uninitialized use) */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists: one iteration per i entry, nri in total */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Offset of this list's shift vector; used to index both shiftvec and fshift */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* The j neighbors of this i entry are jjnr[j_index_start] .. jjnr[j_index_end-1] */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_1rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+
+        /* Load parameters for i particles: the i charge is pre-multiplied by facel (fr->epsfac) */
+        iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_load1_v2r8(charge+inr+0));
+
+        /* Start inner kernel loop: j neighbors are taken two at a time (A,B); an odd leftover is handled after the loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq00,rinv00);    /* qq00*rinv00; here only an intermediate for felec */
+            felec            = _fjsp_mul_v2r8(velec,rinvsq00); /* scalar force factor: velec*rinvsq00 */
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fscal,dx00,dy00,dz00);
+
+            /* Inner loop uses 30 flops */
+        }
+
+        if(jidx<j_index_end) /* one unpaired j neighbor is left: same computation using only jnrA */
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+
+            /* Load parameters for j particles: only jnrA's charge is read, the zero vector is the other operand */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq00,rinv00);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq00);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8()); /* presumably zeroes the lane that has no real j atom - TODO confirm */
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fscal,dx00,dy00,dz00);
+
+            /* Inner loop uses 30 flops */
+        }
+
+        /* End of innermost loop: hand the accumulated i force to the helper together with the f and fshift destinations */
+
+        gmx_fjsp_update_iforce_1atom_swizzle_v2r8(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 7 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops, using the per-iteration counts quoted in the loop comments above */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_F,outeriter*7 + inneriter*30);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCoul_VdwNone_GeomW3P1_sparc64_hpc_ace_double.c b/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCoul_VdwNone_GeomW3P1_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..dbcbeab
--- /dev/null
@@ -0,0 +1,792 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecCoul_VdwNone_GeomW3P1_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: Coulomb
+ * VdW interaction:            None
+ * Geometry:                   Water3-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecCoul_VdwNone_GeomW3P1_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,       /* neighbor list; nri, iinr, jindex, jjnr, shift and gid are read */
+                     rvec * gmx_restrict                    xx,          /* coordinates, accessed as a flat real array starting at xx[0] */
+                     rvec * gmx_restrict                    ff,          /* force array; f=ff[0] is handed to the force-update helpers */
+                     t_forcerec * gmx_restrict              fr,          /* supplies shift_vec, fshift and epsfac */
+                     t_mdatoms * gmx_restrict               mdatoms,     /* supplies chargeA */
+                     nb_kernel_data_t * gmx_restrict        kernel_data, /* energygrp_elec is handed to the potential-update helper */
+                     t_nrnb * gmx_restrict                  nrnb)        /* flop accounting, updated through inc_nrnb */
+{
+    /* Coulomb-only kernel: 3-site i water (first suffix digit 0,1,2) against single j particles (second digit,
+     * always 0); it accumulates i forces, j forces and the electrostatic potential sum of each i entry.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. the two different
+     * jnr indices whose data share one SIMD register. Many of the locals declared below are not referenced here.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+
+    /* Setup water-specific parameters: the three charges of the first i entry, scaled by epsfac, are reused for every i entry */
+    inr              = nlist->iinr[0]; /* NOTE(review): read before nri is examined; presumably callers guarantee nri>0 - confirm */
+    iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Offset of this list's shift vector (DIM reals per shift index) */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors: this i entry owns jjnr[j_index_start] .. jjnr[j_index_end-1] */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _fjsp_setzero_v2r8(); /* Clear the i-force accumulators of the three water sites */
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop: two j entries (A,B) per trip; an odd last entry is left for the epilogue after the loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vectors d = i - j for each of the three i sites */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+
+            fjx0             = _fjsp_setzero_v2r8(); /* j-force accumulators, summed over the three i sites below */
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i site 0 and the j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+
+            /* COULOMB ELECTROSTATICS: velec = qq*rinv, felec = velec*rinv^2 */
+            velec            = _fjsp_mul_v2r8(qq00,rinv00);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force: the same d,fscal pair is fed to the i-site and to the j accumulator */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i site 1 and the j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq10,rinv10);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq10);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i site 2 and the j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq20,rinv20);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq20);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0); /* helper defined elsewhere; by its name it subtracts the j forces from f at both offsets - TODO confirm */
+
+            /* Inner loop uses 96 flops */
+        }
+
+        if(jidx<j_index_end) /* Epilogue: exactly one j entry is left when this i entry has an odd number of neighbors */
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vectors d = i - j for each of the three i sites */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+
+            /* Load parameters for j particles: only entry A exists, loaded with loadl on top of a zero register */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i site 0 and the j atom */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq00,rinv00);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom (unpacklo with zero presumably clears the unused B lane - confirm). */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8()); /* same treatment for the force scalar; presumably zeroes the B lane - confirm */
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i site 1 and the j atom */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq10,rinv10);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq10);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i site 2 and the j atom */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq20,rinv20);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq20);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 96 flops */
+        }
+
+        /* End of innermost loop; the helper below receives the nine i-force accumulators plus f and fshift (presumably reduces the lanes and adds to both - confirm) */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies: velecsum is passed with the address of energygrp_elec[ggid] */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+
+        /* Increment number of inner iterations (counted per j entry, not per unrolled loop trip) */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 19 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3_VF,outeriter*19 + inneriter*96);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecCoul_VdwNone_GeomW3P1_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: Coulomb
+ * VdW interaction:            None
+ * Geometry:                   Water3-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecCoul_VdwNone_GeomW3P1_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,       /* neighbor list; nri, iinr, jindex, jjnr, shift and gid are read */
+                     rvec * gmx_restrict                    xx,          /* coordinates, accessed as a flat real array starting at xx[0] */
+                     rvec * gmx_restrict                    ff,          /* force array; f=ff[0] is handed to the force-update helpers */
+                     t_forcerec * gmx_restrict              fr,          /* supplies shift_vec, fshift and epsfac */
+                     t_mdatoms * gmx_restrict               mdatoms,     /* supplies chargeA */
+                     nb_kernel_data_t * gmx_restrict        kernel_data, /* not referenced in this force-only kernel */
+                     t_nrnb * gmx_restrict                  nrnb)        /* flop accounting, updated through inc_nrnb */
+{
+    /* Force-only Coulomb kernel: 3-site i water (first suffix digit 0,1,2) against single j particles (second
+     * digit, always 0); velec is only an intermediate for felec and no potential sum is accumulated.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. the two different
+     * jnr indices whose data share one SIMD register. Many of the locals declared below are not referenced here.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid; /* assigned but never read in this force-only kernel */
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+
+    /* Setup water-specific parameters: the three charges of the first i entry, scaled by epsfac, are reused for every i entry */
+    inr              = nlist->iinr[0]; /* NOTE(review): read before nri is examined; presumably callers guarantee nri>0 - confirm */
+    iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Offset of this list's shift vector (DIM reals per shift index) */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors: this i entry owns jjnr[j_index_start] .. jjnr[j_index_end-1] */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _fjsp_setzero_v2r8(); /* Clear the i-force accumulators of the three water sites */
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop: two j entries (A,B) per trip; an odd last entry is left for the epilogue after the loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vectors d = i - j for each of the three i sites */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+
+            fjx0             = _fjsp_setzero_v2r8(); /* j-force accumulators, summed over the three i sites below */
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i site 0 and the j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+
+            /* COULOMB ELECTROSTATICS: velec = qq*rinv, felec = velec*rinv^2 */
+            velec            = _fjsp_mul_v2r8(qq00,rinv00);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq00);
+
+            fscal            = felec;
+
+            /* Update vectorial force: the same d,fscal pair is fed to the i-site and to the j accumulator */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i site 1 and the j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq10,rinv10);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq10);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i site 2 and the j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq20,rinv20);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq20);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0); /* helper defined elsewhere; by its name it subtracts the j forces from f at both offsets - TODO confirm */
+
+            /* Inner loop uses 93 flops */
+        }
+
+        if(jidx<j_index_end) /* Epilogue: exactly one j entry is left when this i entry has an odd number of neighbors */
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vectors d = i - j for each of the three i sites */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+
+            /* Load parameters for j particles: only entry A exists, loaded with loadl on top of a zero register */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i site 0 and the j atom */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq00,rinv00);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq00);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8()); /* unpacklo with zero presumably clears the unused B lane of the force scalar - confirm */
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i site 1 and the j atom */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq10,rinv10);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq10);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i site 2 and the j atom */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq20,rinv20);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq20);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 93 flops */
+        }
+
+        /* End of innermost loop; the helper below receives the nine i-force accumulators plus f and fshift (presumably reduces the lanes and adds to both - confirm) */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations (counted per j entry, not per unrolled loop trip) */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 18 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3_F,outeriter*18 + inneriter*93);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCoul_VdwNone_GeomW3W3_sparc64_hpc_ace_double.c b/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCoul_VdwNone_GeomW3W3_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..55fe116
--- /dev/null
@@ -0,0 +1,1480 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecCoul_VdwNone_GeomW3W3_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: Coulomb
+ * VdW interaction:            None
+ * Geometry:                   Water3-Water3
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecCoul_VdwNone_GeomW3W3_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. the two different
+     * jnr indices corresponding to the data put in the two positions of the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B;
+    _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B;
+    _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+    _fjsp_v2r8       dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+
+    jq0              = gmx_fjsp_set1_v2r8(charge[inr+0]);
+    jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+    jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+    qq00             = _fjsp_mul_v2r8(iq0,jq0);
+    qq01             = _fjsp_mul_v2r8(iq0,jq1);
+    qq02             = _fjsp_mul_v2r8(iq0,jq2);
+    qq10             = _fjsp_mul_v2r8(iq1,jq0);
+    qq11             = _fjsp_mul_v2r8(iq1,jq1);
+    qq12             = _fjsp_mul_v2r8(iq1,jq2);
+    qq20             = _fjsp_mul_v2r8(iq2,jq0);
+    qq21             = _fjsp_mul_v2r8(iq2,jq1);
+    qq22             = _fjsp_mul_v2r8(iq2,jq2);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx01             = _fjsp_sub_v2r8(ix0,jx1);
+            dy01             = _fjsp_sub_v2r8(iy0,jy1);
+            dz01             = _fjsp_sub_v2r8(iz0,jz1);
+            dx02             = _fjsp_sub_v2r8(ix0,jx2);
+            dy02             = _fjsp_sub_v2r8(iy0,jy2);
+            dz02             = _fjsp_sub_v2r8(iz0,jz2);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+            rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+            rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+            rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq00,rinv00);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq01,rinv01);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq01);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+            
+            fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq02,rinv02);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq02);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+            
+            fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq10,rinv10);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq10);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq11,rinv11);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq11);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq12,rinv12);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq12);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq20,rinv20);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq20);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq21,rinv21);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq21);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq22,rinv22);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq22);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            gmx_fjsp_decrement_3rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 279 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx01             = _fjsp_sub_v2r8(ix0,jx1);
+            dy01             = _fjsp_sub_v2r8(iy0,jy1);
+            dz01             = _fjsp_sub_v2r8(iz0,jz1);
+            dx02             = _fjsp_sub_v2r8(ix0,jx2);
+            dy02             = _fjsp_sub_v2r8(iy0,jy2);
+            dz02             = _fjsp_sub_v2r8(iz0,jz2);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+            rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+            rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+            rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq00,rinv00);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq01,rinv01);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq01);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+            
+            fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq02,rinv02);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq02);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+            
+            fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq10,rinv10);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq10);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq11,rinv11);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq11);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq12,rinv12);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq12);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq20,rinv20);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq20);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq21,rinv21);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq21);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq22,rinv22);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq22);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            gmx_fjsp_decrement_3rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 279 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 19 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_VF,outeriter*19 + inneriter*279);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecCoul_VdwNone_GeomW3W3_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: Coulomb
+ * VdW interaction:            None
+ * Geometry:                   Water3-Water3
+ * Calculate force/pot:        Force
+ *
+ * For every entry iidx of the neighbor list the kernel loads the three i-site
+ * coordinates (with the shift vector added), walks the j entries
+ * jindex[iidx]..jindex[iidx+1]-1 two at a time, and evaluates all nine
+ * site-site pairs as fscal = qq*rinv*rinvsq.  Only forces are accumulated in
+ * this variant; velec is computed solely as an intermediate for felec.
+ *
+ * nlist        supplies nri, iinr, jindex, jjnr, shift and gid.
+ * xx, ff       coordinate and force arrays, accessed as flat real arrays.
+ * fr           supplies shift_vec, fshift and epsfac.
+ * mdatoms      supplies chargeA.
+ * kernel_data  not referenced by this force-only kernel.
+ * nrnb         receives the flop count through inc_nrnb().
+ */
+void
+nb_kernel_ElecCoul_VdwNone_GeomW3W3_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. the two different
+     * jnr indices (jnrA, jnrB) whose data are put in the two positions of the SIMD register.
+     *
+     * Several locals declared below (for example rcutoff_scalar, tx/ty/tz, krf, the c6_XY/c12_XY
+     * set and dummy_mask) are never referenced in this particular kernel.
+     * NOTE(review): presumably they come from a declaration template shared by the generator - confirm.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B;
+    _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B;
+    _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+    _fjsp_v2r8       dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+
+    /* Setup water-specific parameters.
+     * The three site charges are read once, from the first i entry (iinr[0]), and the same
+     * values are used for both the i and the j water; the i charges are pre-multiplied by facel.
+     * All nine charge products qqXY are therefore loop-invariant and computed here.
+     * NOTE(review): this assumes every water in the list carries identical charges, and that
+     * iinr[0] is valid (the list is read before nri is checked) - confirm against the caller.
+     */
+    inr              = nlist->iinr[0];
+    iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+
+    jq0              = gmx_fjsp_set1_v2r8(charge[inr+0]);
+    jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+    jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+    qq00             = _fjsp_mul_v2r8(iq0,jq0);
+    qq01             = _fjsp_mul_v2r8(iq0,jq1);
+    qq02             = _fjsp_mul_v2r8(iq0,jq2);
+    qq10             = _fjsp_mul_v2r8(iq1,jq0);
+    qq11             = _fjsp_mul_v2r8(iq1,jq1);
+    qq12             = _fjsp_mul_v2r8(iq1,jq2);
+    qq20             = _fjsp_mul_v2r8(iq2,jq0);
+    qq21             = _fjsp_mul_v2r8(iq2,jq1);
+    qq22             = _fjsp_mul_v2r8(iq2,jq2);
+
+    /* Give the j indices and offsets defined values up front to avoid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop.
+         * Each iteration consumes two j entries (jnrA and jnrB); the loop stops while at least
+         * one entry may remain, which is handled by the single-j block after the loop.
+         */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx01             = _fjsp_sub_v2r8(ix0,jx1);
+            dy01             = _fjsp_sub_v2r8(iy0,jy1);
+            dz01             = _fjsp_sub_v2r8(iz0,jz1);
+            dx02             = _fjsp_sub_v2r8(ix0,jx2);
+            dy02             = _fjsp_sub_v2r8(iy0,jy2);
+            dz02             = _fjsp_sub_v2r8(iz0,jz2);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+            rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+            rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+            rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq00,rinv00);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq00);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq01,rinv01);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq01);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+            
+            fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq02,rinv02);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq02);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+            
+            fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq10,rinv10);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq10);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq11,rinv11);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq11);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq12,rinv12);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq12);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq20,rinv20);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq20);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq21,rinv21);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq21);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq22,rinv22);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq22);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            gmx_fjsp_decrement_3rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 270 flops */
+        }
+
+        if(jidx<j_index_end) /* one j entry is left over when the j count is odd */
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx01             = _fjsp_sub_v2r8(ix0,jx1);
+            dy01             = _fjsp_sub_v2r8(iy0,jy1);
+            dz01             = _fjsp_sub_v2r8(iz0,jz1);
+            dx02             = _fjsp_sub_v2r8(ix0,jx2);
+            dy02             = _fjsp_sub_v2r8(iy0,jy2);
+            dz02             = _fjsp_sub_v2r8(iz0,jz2);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+            rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+            rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+            rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq00,rinv00);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq00);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force.
+             * NOTE(review): in this single-j block fscal is first combined with a zero vector via
+             * _fjsp_unpacklo_v2r8; presumably that clears the SIMD lane which holds no j water so
+             * it adds nothing to the forces - confirm against the intrinsic's definition.
+             * The same masking is repeated for each of the nine pairs below.
+             */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq01,rinv01);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq01);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+            
+            fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq02,rinv02);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq02);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+            
+            fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq10,rinv10);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq10);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq11,rinv11);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq11);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq12,rinv12);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq12);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq20,rinv20);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq20);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq21,rinv21);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq21);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq22,rinv22);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq22);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            gmx_fjsp_decrement_3rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 270 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 18 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops: 18 per outer iteration and 270 per j entry,
+     * reported under the eNR_NBKERNEL_ELEC_W3W3_F counter.
+     */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_F,outeriter*18 + inneriter*270);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCoul_VdwNone_GeomW4P1_sparc64_hpc_ace_double.c b/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCoul_VdwNone_GeomW4P1_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..faa314c
--- /dev/null
@@ -0,0 +1,792 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecCoul_VdwNone_GeomW4P1_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: Coulomb
+ * VdW interaction:            None
+ * Geometry:                   Water4-Particle
+ * Calculate force/pot:        PotentialAndForce
+ *
+ * For each of the nlist->nri entries of the neighbor list this kernel
+ * accumulates plain Coulomb interactions (velec = qq*rinv,
+ * felec = velec*rinvsq) between three sites of the i water - atoms
+ * inr+1, inr+2 and inr+3; atom inr itself takes no part - and every
+ * j particle listed in jjnr[jindex[iidx] .. jindex[iidx+1]-1].
+ * The j particles are handled two at a time, with a separate path for
+ * one leftover particle when the j count is odd.
+ *
+ * nlist        neighbor list; nri, iinr, jindex, jjnr, shift and gid are read
+ * xx           coordinates, accessed as x = xx[0] with DIM reals per atom
+ * ff           forces, accessed as f = ff[0]; updated for both i and j atoms
+ * fr           supplies shift_vec, fshift (updated) and epsfac
+ * mdatoms      supplies the per-atom charges (chargeA)
+ * kernel_data  energygrp_elec[gid[iidx]] is passed to the potential update helper
+ * nrnb         flop accounting, updated through inc_nrnb()
+ */
+void
+nb_kernel_ElecCoul_VdwNone_GeomW4P1_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. the two different
+     * jnr indices whose data are put in the two positions of the SIMD register.
+     *
+     * Note: this is generated code. Several of the locals declared below (rcutoff and friends,
+     * the vdw offsets, c6_NN/c12_NN, crf/krf/krf2, itab_tmp, the masks, one/two and the
+     * vfconv/gbconv/ewconv unions, among others) are never referenced by this kernel.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+
+    /* Setup water-specific parameters.
+     * The i charges (pre-multiplied by facel) are taken once, from the water of the
+     * first list entry, and reused for every entry - this assumes all i waters in the
+     * list carry identical charges.
+     * NOTE(review): iinr[0] is read before the nri loop, i.e. also when nri is 0 -
+     * confirm that callers never invoke the kernel with an empty list.
+     */
+    inr              = nlist->iinr[0];
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+
+    /* Give the j indices/offsets a defined value up front to avoid compiler
+     * warnings about possibly uninitialized variables.
+     */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector.
+         * The +DIM offset starts the load at atom inr+1, skipping the first water site.
+         */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
+                                                 &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+        fix3             = _fjsp_setzero_v2r8();
+        fiy3             = _fjsp_setzero_v2r8();
+        fiz3             = _fjsp_setzero_v2r8();
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop: two j particles (A and B) per iteration.
+         * The bound j_index_end-1 guarantees that jjnr[jidx+1] is still inside this list.
+         */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx30             = _fjsp_sub_v2r8(ix3,jx0);
+            dy30             = _fjsp_sub_v2r8(iy3,jy0);
+            dz30             = _fjsp_sub_v2r8(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq10,rinv10);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq10);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force.
+             * The same d*fscal term is accumulated for the i site and for the j particle;
+             * the j sum is handed to a "decrement" helper at the end of the iteration.
+             */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq20,rinv20);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq20);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _fjsp_mul_v2r8(iq3,jq0);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq30,rinv30);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq30);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+            
+            fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+
+            gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 96 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates.
+             * This branch is the remainder path: it runs when the j count of this entry is
+             * odd and one particle is left after the paired loop. Only the A index is used;
+             * the 1ptr helpers presumably fill/consume just the first SIMD element - confirm
+             * against kernelutil_sparc64_hpc_ace_double.h.
+             */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx30             = _fjsp_sub_v2r8(ix3,jx0);
+            dy30             = _fjsp_sub_v2r8(iy3,jy0);
+            dz30             = _fjsp_sub_v2r8(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+
+            /* Load parameters for j particles (single charge, combined with a zero vector) */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq10,rinv10);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq10);
+
+            /* Update potential sum for this i atom from the interaction with this j atom.
+             * velec is first combined with a zero vector via unpacklo, presumably so that the
+             * SIMD element without a real j particle contributes nothing - confirm against
+             * the intrinsic's definition. The same is done to fscal below.
+             */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq20,rinv20);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq20);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _fjsp_mul_v2r8(iq3,jq0);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq30,rinv30);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq30);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+            
+            fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+
+            gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 96 flops */
+        }
+
+        /* End of innermost loop.
+         * The accumulated i forces are handed to the update helper together with the
+         * force pointer of atom inr+1 (hence +DIM) and the shift-force slot of this entry.
+         */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset+DIM,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies (energy group slot selected by gid[iidx]) */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+
+        /* Increment number of inner iterations (counted per j particle, not per SIMD pair) */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 19 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops, using the per-iteration counts quoted in the loops above */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4_VF,outeriter*19 + inneriter*96);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecCoul_VdwNone_GeomW4P1_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: Coulomb
+ * VdW interaction:            None
+ * Geometry:                   Water4-Particle
+ * Calculate force/pot:        Force
+ *
+ * Force-only kernel: for each of the nlist->nri entries of the neighbor
+ * list it accumulates plain Coulomb forces (felec = qq*rinv*rinvsq)
+ * between three sites of the i water - atoms inr+1, inr+2 and inr+3;
+ * atom inr itself takes no part - and every j particle listed in
+ * jjnr[jindex[iidx] .. jindex[iidx+1]-1]. No energies are written.
+ * The j particles are handled two at a time, with a separate path for
+ * one leftover particle when the j count is odd.
+ *
+ * nlist        neighbor list; nri, iinr, jindex, jjnr and shift are used
+ * xx           coordinates, accessed as x = xx[0] with DIM reals per atom
+ * ff           forces, accessed as f = ff[0]; updated for both i and j atoms
+ * fr           supplies shift_vec, fshift (updated) and epsfac
+ * mdatoms      supplies the per-atom charges (chargeA)
+ * kernel_data  not referenced in this force-only variant
+ * nrnb         flop accounting, updated through inc_nrnb()
+ */
+void
+nb_kernel_ElecCoul_VdwNone_GeomW4P1_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. the two different
+     * jnr indices whose data are put in the two positions of the SIMD register.
+     *
+     * Note: this is generated code. Several of the locals declared below (rcutoff and friends,
+     * the vdw offsets, c6_NN/c12_NN, crf/krf/krf2, itab_tmp, the masks, one/two, ggid, velecsum
+     * and the vfconv/gbconv/ewconv unions, among others) are never referenced by this kernel;
+     * gid is assigned but not read afterwards.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+
+    /* Setup water-specific parameters.
+     * The i charges (pre-multiplied by facel) are taken once, from the water of the
+     * first list entry, and reused for every entry - this assumes all i waters in the
+     * list carry identical charges.
+     * NOTE(review): iinr[0] is read before the nri loop, i.e. also when nri is 0 -
+     * confirm that callers never invoke the kernel with an empty list.
+     */
+    inr              = nlist->iinr[0];
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+
+    /* Give the j indices/offsets a defined value up front to avoid compiler
+     * warnings about possibly uninitialized variables.
+     */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector.
+         * The +DIM offset starts the load at atom inr+1, skipping the first water site.
+         */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
+                                                 &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+        fix3             = _fjsp_setzero_v2r8();
+        fiy3             = _fjsp_setzero_v2r8();
+        fiz3             = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop: two j particles (A and B) per iteration.
+         * The bound j_index_end-1 guarantees that jjnr[jidx+1] is still inside this list.
+         */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx30             = _fjsp_sub_v2r8(ix3,jx0);
+            dy30             = _fjsp_sub_v2r8(iy3,jy0);
+            dz30             = _fjsp_sub_v2r8(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* COULOMB ELECTROSTATICS (velec is only an intermediate for felec here) */
+            velec            = _fjsp_mul_v2r8(qq10,rinv10);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq10);
+
+            fscal            = felec;
+
+            /* Update vectorial force.
+             * The same d*fscal term is accumulated for the i site and for the j particle;
+             * the j sum is handed to a "decrement" helper at the end of the iteration.
+             */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq20,rinv20);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq20);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _fjsp_mul_v2r8(iq3,jq0);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq30,rinv30);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq30);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+            
+            fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+
+            gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 93 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates.
+             * This branch is the remainder path: it runs when the j count of this entry is
+             * odd and one particle is left after the paired loop. Only the A index is used;
+             * the 1ptr helpers presumably fill/consume just the first SIMD element - confirm
+             * against kernelutil_sparc64_hpc_ace_double.h.
+             */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx30             = _fjsp_sub_v2r8(ix3,jx0);
+            dy30             = _fjsp_sub_v2r8(iy3,jy0);
+            dz30             = _fjsp_sub_v2r8(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+
+            /* Load parameters for j particles (single charge, combined with a zero vector) */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq10,rinv10);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq10);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force.
+             * fscal was combined with a zero vector via unpacklo just above, presumably so
+             * that the SIMD element without a real j particle contributes no force - confirm
+             * against the intrinsic's definition.
+             */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq20,rinv20);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq20);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _fjsp_mul_v2r8(iq3,jq0);
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq30,rinv30);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq30);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+            
+            fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+
+            gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 93 flops */
+        }
+
+        /* End of innermost loop.
+         * The accumulated i forces are handed to the update helper together with the
+         * force pointer of atom inr+1 (hence +DIM) and the shift-force slot of this entry.
+         */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset+DIM,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations (counted per j particle, not per SIMD pair) */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 18 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops, using the per-iteration counts quoted in the loops above */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4_F,outeriter*18 + inneriter*93);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCoul_VdwNone_GeomW4W4_sparc64_hpc_ace_double.c b/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecCoul_VdwNone_GeomW4W4_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..51d8be8
--- /dev/null
@@ -0,0 +1,1480 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecCoul_VdwNone_GeomW4W4_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: Coulomb
+ * VdW interaction:            None
+ * Geometry:                   Water4-Water4
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecCoul_VdwNone_GeomW4W4_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx1A,vdwjidx1B;
+    _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B;
+    _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    int              vdwjidx3A,vdwjidx3B;
+    _fjsp_v2r8       jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+    _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    _fjsp_v2r8       dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+    _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    _fjsp_v2r8       dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+    _fjsp_v2r8       dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+    _fjsp_v2r8       dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+    _fjsp_v2r8       dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+
+    jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+    jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+    jq3              = gmx_fjsp_set1_v2r8(charge[inr+3]);
+    qq11             = _fjsp_mul_v2r8(iq1,jq1);
+    qq12             = _fjsp_mul_v2r8(iq1,jq2);
+    qq13             = _fjsp_mul_v2r8(iq1,jq3);
+    qq21             = _fjsp_mul_v2r8(iq2,jq1);
+    qq22             = _fjsp_mul_v2r8(iq2,jq2);
+    qq23             = _fjsp_mul_v2r8(iq2,jq3);
+    qq31             = _fjsp_mul_v2r8(iq3,jq1);
+    qq32             = _fjsp_mul_v2r8(iq3,jq2);
+    qq33             = _fjsp_mul_v2r8(iq3,jq3);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
+                                                 &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+        fix3             = _fjsp_setzero_v2r8();
+        fiy3             = _fjsp_setzero_v2r8();
+        fiz3             = _fjsp_setzero_v2r8();
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
+                                              &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx13             = _fjsp_sub_v2r8(ix1,jx3);
+            dy13             = _fjsp_sub_v2r8(iy1,jy3);
+            dz13             = _fjsp_sub_v2r8(iz1,jz3);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+            dx23             = _fjsp_sub_v2r8(ix2,jx3);
+            dy23             = _fjsp_sub_v2r8(iy2,jy3);
+            dz23             = _fjsp_sub_v2r8(iz2,jz3);
+            dx31             = _fjsp_sub_v2r8(ix3,jx1);
+            dy31             = _fjsp_sub_v2r8(iy3,jy1);
+            dz31             = _fjsp_sub_v2r8(iz3,jz1);
+            dx32             = _fjsp_sub_v2r8(ix3,jx2);
+            dy32             = _fjsp_sub_v2r8(iy3,jy2);
+            dz32             = _fjsp_sub_v2r8(iz3,jz2);
+            dx33             = _fjsp_sub_v2r8(ix3,jx3);
+            dy33             = _fjsp_sub_v2r8(iy3,jy3);
+            dz33             = _fjsp_sub_v2r8(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+            rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+            rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+            rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+            rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+            rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+            rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+            rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+            rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+            rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+            rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+            rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+            rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+            fjx3             = _fjsp_setzero_v2r8();
+            fjy3             = _fjsp_setzero_v2r8();
+            fjz3             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq11,rinv11);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq11);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq12,rinv12);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq12);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq13,rinv13);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq13);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+            
+            fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq21,rinv21);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq21);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq22,rinv22);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq22);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq23,rinv23);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq23);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+            
+            fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq31,rinv31);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq31);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+            
+            fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq32,rinv32);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq32);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+            
+            fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq33,rinv33);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq33);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+            
+            fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+
+            gmx_fjsp_decrement_3rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA+DIM,f+j_coord_offsetB+DIM,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 279 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA+DIM,
+                                              &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx13             = _fjsp_sub_v2r8(ix1,jx3);
+            dy13             = _fjsp_sub_v2r8(iy1,jy3);
+            dz13             = _fjsp_sub_v2r8(iz1,jz3);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+            dx23             = _fjsp_sub_v2r8(ix2,jx3);
+            dy23             = _fjsp_sub_v2r8(iy2,jy3);
+            dz23             = _fjsp_sub_v2r8(iz2,jz3);
+            dx31             = _fjsp_sub_v2r8(ix3,jx1);
+            dy31             = _fjsp_sub_v2r8(iy3,jy1);
+            dz31             = _fjsp_sub_v2r8(iz3,jz1);
+            dx32             = _fjsp_sub_v2r8(ix3,jx2);
+            dy32             = _fjsp_sub_v2r8(iy3,jy2);
+            dz32             = _fjsp_sub_v2r8(iz3,jz2);
+            dx33             = _fjsp_sub_v2r8(ix3,jx3);
+            dy33             = _fjsp_sub_v2r8(iy3,jy3);
+            dz33             = _fjsp_sub_v2r8(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+            rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+            rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+            rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+            rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+            rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+            rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+            rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+            rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+            rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+            rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+            rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+            rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+            fjx3             = _fjsp_setzero_v2r8();
+            fjy3             = _fjsp_setzero_v2r8();
+            fjz3             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq11,rinv11);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq11);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq12,rinv12);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq12);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq13,rinv13);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq13);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+            
+            fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq21,rinv21);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq21);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq22,rinv22);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq22);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq23,rinv23);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq23);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+            
+            fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq31,rinv31);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq31);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+            
+            fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq32,rinv32);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq32);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+            
+            fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq33,rinv33);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq33);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+            
+            fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+
+            gmx_fjsp_decrement_3rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA+DIM,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 279 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset+DIM,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 19 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_VF,outeriter*19 + inneriter*279);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecCoul_VdwNone_GeomW4W4_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: Coulomb
+ * VdW interaction:            None
+ * Geometry:                   Water4-Water4
+ * Calculate force/pot:        Force
+ *
+ * Overview of what the body below does:
+ *  - For every entry of the neighborlist it accumulates plain Coulomb forces between
+ *    three sites of the i water and three sites of each j water (nine site pairs).
+ *    No potential energy is accumulated or stored by this force-only variant.
+ *  - Sites 1..3 are addressed with a +DIM coordinate/force offset and charge[inr+1..3],
+ *    i.e. the first atom of each water is skipped by this kernel.
+ *  - The j list is consumed two entries per iteration (A and B); a trailing odd entry
+ *    is handled by a separate single-j block after the loop.
+ *  - The flop estimate is reported through inc_nrnb() at the end.
+ */
+void
+nb_kernel_ElecCoul_VdwNone_GeomW4W4_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. the two different
+     * jnr indices whose data are put in the two positions of the SIMD register.
+     *
+     * NOTE(review): a number of the locals declared below (e.g. rcutoff*, r11..r33, c6_*,
+     * c12_*, velecsum, krf*, the masks, one/two and the *conv unions) are not referenced
+     * in this kernel body; presumably they are emitted by the kernel generator for all
+     * kernel flavours.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx1A,vdwjidx1B;
+    _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B;
+    _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    int              vdwjidx3A,vdwjidx3B;
+    _fjsp_v2r8       jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+    _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    _fjsp_v2r8       dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+    _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    _fjsp_v2r8       dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+    _fjsp_v2r8       dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+    _fjsp_v2r8       dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+    _fjsp_v2r8       dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+
+    /* Setup water-specific parameters.
+     * The per-site charges are read once, from the first i water in the list (iinr[0]),
+     * and reused for every i and j water in this call. facel (fr->epsfac) is folded into
+     * the i charges, so the nine qqXY products below already contain that prefactor.
+     */
+    inr              = nlist->iinr[0];
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+
+    jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+    jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+    jq3              = gmx_fjsp_set1_v2r8(charge[inr+3]);
+    qq11             = _fjsp_mul_v2r8(iq1,jq1);
+    qq12             = _fjsp_mul_v2r8(iq1,jq2);
+    qq13             = _fjsp_mul_v2r8(iq1,jq3);
+    qq21             = _fjsp_mul_v2r8(iq2,jq1);
+    qq22             = _fjsp_mul_v2r8(iq2,jq2);
+    qq23             = _fjsp_mul_v2r8(iq2,jq3);
+    qq31             = _fjsp_mul_v2r8(iq3,jq1);
+    qq32             = _fjsp_mul_v2r8(iq3,jq2);
+    qq33             = _fjsp_mul_v2r8(iq3,jq3);
+
+    /* Give these a defined value up front to avoid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector.
+         * The +DIM offset starts the load at the second atom of the i water.
+         */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
+                                                 &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+        fix3             = _fjsp_setzero_v2r8();
+        fiy3             = _fjsp_setzero_v2r8();
+        fiz3             = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop: two j waters (A and B) per iteration.
+         * The loop stops before a trailing odd entry, which is handled after the loop.
+         */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates (again starting at the second atom: +DIM) */
+            gmx_fjsp_load_3rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
+                                              &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector (i minus j) for all nine site pairs */
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx13             = _fjsp_sub_v2r8(ix1,jx3);
+            dy13             = _fjsp_sub_v2r8(iy1,jy3);
+            dz13             = _fjsp_sub_v2r8(iz1,jz3);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+            dx23             = _fjsp_sub_v2r8(ix2,jx3);
+            dy23             = _fjsp_sub_v2r8(iy2,jy3);
+            dz23             = _fjsp_sub_v2r8(iz2,jz3);
+            dx31             = _fjsp_sub_v2r8(ix3,jx1);
+            dy31             = _fjsp_sub_v2r8(iy3,jy1);
+            dz31             = _fjsp_sub_v2r8(iz3,jz1);
+            dx32             = _fjsp_sub_v2r8(ix3,jx2);
+            dy32             = _fjsp_sub_v2r8(iy3,jy2);
+            dz32             = _fjsp_sub_v2r8(iz3,jz2);
+            dx33             = _fjsp_sub_v2r8(ix3,jx3);
+            dy33             = _fjsp_sub_v2r8(iy3,jy3);
+            dz33             = _fjsp_sub_v2r8(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+            rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+            rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+            rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+            rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+            rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+            rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+            rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+            rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+            rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+            rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+            rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+            rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+            /* j-force accumulators are reset every iteration and handed to the
+             * decrement helper at the end of the same iteration.
+             */
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+            fjx3             = _fjsp_setzero_v2r8();
+            fjy3             = _fjsp_setzero_v2r8();
+            fjz3             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS
+             * velec = qq*rinv, scalar force factor felec = velec*rinvsq.
+             * velec is only an intermediate here; it is not summed in this force-only kernel.
+             * The same pattern is repeated for each of the nine site pairs below.
+             */
+            velec            = _fjsp_mul_v2r8(qq11,rinv11);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq11);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq12,rinv12);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq12);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq13,rinv13);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq13);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+            
+            fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq21,rinv21);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq21);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq22,rinv22);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq22);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq23,rinv23);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq23);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+            
+            fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq31,rinv31);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq31);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+            
+            fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq32,rinv32);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq32);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+            
+            fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq33,rinv33);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq33);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+            
+            fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+            /* Hand the accumulated j forces for both j waters (A and B) to the helper that
+             * updates f; judging by its name it subtracts them -- confirm in the kernelutil header.
+             */
+            gmx_fjsp_decrement_3rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA+DIM,f+j_coord_offsetB+DIM,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 270 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+            /* Remainder: one j water (A) is left over when the number of j entries is odd.
+             * Same arithmetic as the unrolled loop above, using the 1ptr load/decrement helpers.
+             */
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA+DIM,
+                                              &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx13             = _fjsp_sub_v2r8(ix1,jx3);
+            dy13             = _fjsp_sub_v2r8(iy1,jy3);
+            dz13             = _fjsp_sub_v2r8(iz1,jz3);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+            dx23             = _fjsp_sub_v2r8(ix2,jx3);
+            dy23             = _fjsp_sub_v2r8(iy2,jy3);
+            dz23             = _fjsp_sub_v2r8(iz2,jz3);
+            dx31             = _fjsp_sub_v2r8(ix3,jx1);
+            dy31             = _fjsp_sub_v2r8(iy3,jy1);
+            dz31             = _fjsp_sub_v2r8(iz3,jz1);
+            dx32             = _fjsp_sub_v2r8(ix3,jx2);
+            dy32             = _fjsp_sub_v2r8(iy3,jy2);
+            dz32             = _fjsp_sub_v2r8(iz3,jz2);
+            dx33             = _fjsp_sub_v2r8(ix3,jx3);
+            dy33             = _fjsp_sub_v2r8(iy3,jy3);
+            dz33             = _fjsp_sub_v2r8(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+            rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+            rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+            rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+            rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+            rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+            rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+            rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+            rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+            rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+            rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+            rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+            rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+            fjx3             = _fjsp_setzero_v2r8();
+            fjy3             = _fjsp_setzero_v2r8();
+            fjz3             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq11,rinv11);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq11);
+
+            fscal            = felec;
+            /* NOTE(review): combining fscal with a zero vector presumably clears the SIMD
+             * lane that has no j partner in this single-j block -- confirm the semantics of
+             * _fjsp_unpacklo_v2r8. The same step is applied to every site pair below.
+             */
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq12,rinv12);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq12);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq13,rinv13);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq13);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+            
+            fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq21,rinv21);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq21);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq22,rinv22);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq22);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq23,rinv23);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq23);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+            
+            fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq31,rinv31);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq31);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+            
+            fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq32,rinv32);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq32);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+            
+            fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq33,rinv33);
+            felec            = _fjsp_mul_v2r8(velec,rinvsq33);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+            
+            fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+
+            gmx_fjsp_decrement_3rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA+DIM,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 270 flops */
+        }
+
+        /* End of innermost loop.
+         * The i-force accumulators go to the helper below together with the force array
+         * (starting at the second atom of the i water) and this list's shift-force slot.
+         */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset+DIM,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations (counted per j entry, not per unrolled pair) */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 18 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops: 18 per outer iteration plus 270 per j entry */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_F,outeriter*18 + inneriter*270);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEwSh_VdwLJSh_GeomP1P1_sparc64_hpc_ace_double.c b/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEwSh_VdwLJSh_GeomP1P1_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..c6d3ca7
--- /dev/null
@@ -0,0 +1,672 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwLJSh_GeomP1P1_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: Ewald (tabulated via fr->ic->tabq_coul_FDV0, potential offset by fr->ic->sh_ewald, masked at fr->rcoulomb)
+ * VdW interaction:            LennardJones (potential offset through fr->ic->sh_invrc6, masked at the same cutoff)
+ * Geometry:                   Particle-Particle (one i particle per list entry, j particles processed two per SIMD register)
+ * Calculate force/pot:        PotentialAndForce (forces via ff and fr->fshift, energies via kernel_data->energygrp_elec/energygrp_vdw)
+ */
+void
+nb_kernel_ElecEwSh_VdwLJSh_GeomP1P1_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters (this kernel only uses suffix 0).
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. the two different
+     * jnr indices whose data are put in the two positions of the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+    ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec (fr->rcoulomb); rcutoff2 is its square */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+    rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+
+    sh_vdw_invrcut6  = gmx_fjsp_set1_v2r8(fr->ic->sh_invrc6);
+    rvdw             = gmx_fjsp_set1_v2r8(fr->rvdw);
+
+    /* Only to silence uninitialized-variable warnings: each of these is assigned in the j loops before it is read */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists: one iteration per i particle entry, nri entries in total */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Offset of this entry's shift vector; the same offset indexes fshift (DIM reals per shift index) */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors: this entry's j particles are jjnr[j_index_start..j_index_end-1] */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer (i particle) index and its offset into the flat coordinate/force arrays x and f */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector; results are returned in ix0,iy0,iz0 */
+        gmx_fjsp_load_shift_and_1rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+
+        /* Load parameters for i particles: charge scaled by facel (fr->epsfac) and the i-type row offset into vdwparam */
+        iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_load1_v2r8(charge+inr+0));
+        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+        /* Reset potential sums (accumulated per i entry, handed to the energy group arrays after the j loops) */
+        velecsum         = _fjsp_setzero_v2r8();
+        vvdwsum          = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop: two j particles (A,B) per pass; a leftover odd j is handled after this loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor indices for both SIMD positions, and their coordinate offsets */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* Load coordinates of j atoms A and B into jx0,jy0,jz0 */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector between the i particle and each j particle */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it (rinv00 via invsqrt of rsq00, rinvsq00 = rinv00*rinv00) */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+
+            /* Load parameters for j particles: charges, and per-j offsets into the i-type row of vdwparam (2 reals per type) */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms; this block runs only if gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2) is nonzero */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            /* EWALD ELECTROSTATICS: table points are 4 reals apart; +0 is loaded as ewtabF/ewtabD, +2 as ewtabV (layout per the FDV0 name - confirm) */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer; eweps is the remainder, ewconv exposes both indices */
+            ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv00,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+
+            /* LENNARD-JONES DISPERSION/REPULSION: vvdw applies the sh_vdw_invrcut6 offsets and 1/6, 1/12 factors; fvdw = (vvdw12-vvdw6)*rinvsq00 */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8(_fjsp_nmsub_v2r8(c12_00,_fjsp_mul_v2r8(sh_vdw_invrcut6,sh_vdw_invrcut6),vvdw12),one_twelfth,
+                                           _fjsp_mul_v2r8(_fjsp_nmsub_v2r8( c6_00,sh_vdw_invrcut6,vvdw6),one_sixth));
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            /* Update potential sums for this i atom; cutoff_mask is ANDed in first, presumably zeroing positions beyond the cutoff */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+            vvdw             = _fjsp_and_v2r8(vvdw,cutoff_mask);
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force: accumulate on the i particle here; the helper below updates f at both j offsets from fscal and d */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fscal,dx00,dy00,dz00);
+
+            }
+
+            /* Inner loop uses 67 flops (per j particle, as counted in the inc_nrnb call at the end) */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* Leftover j particle (jidx stopped one short of j_index_end): load its coordinates through pointer A only */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector between the i particle and the single j particle */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it (rinv00 via invsqrt of rsq00, rinvsq00 = rinv00*rinv00) */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+
+            /* Load parameters for the single j particle (charge via _fjsp_loadl_v2r8 onto a zeroed register) */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms; this block runs only if gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2) is nonzero */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            /* EWALD ELECTROSTATICS: as in the paired loop, but only table index ewconv.i[0] is read; the B-side inputs are zero registers */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer; eweps is the remainder */
+            ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv00,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+
+            /* LENNARD-JONES DISPERSION/REPULSION: vvdw applies the sh_vdw_invrcut6 offsets and 1/6, 1/12 factors; fvdw = (vvdw12-vvdw6)*rinvsq00 */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8(_fjsp_nmsub_v2r8(c12_00,_fjsp_mul_v2r8(sh_vdw_invrcut6,sh_vdw_invrcut6),vvdw12),one_twelfth,
+                                           _fjsp_mul_v2r8(_fjsp_nmsub_v2r8( c6_00,sh_vdw_invrcut6,vvdw6),one_sixth));
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            /* Update potential sums; after masking, _fjsp_unpacklo_v2r8 with a zero register presumably clears the unused second position */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+            vvdw             = _fjsp_and_v2r8(vvdw,cutoff_mask);
+            vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force (fscal was masked and passed through the same unpacklo-with-zero step first) */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fscal,dx00,dy00,dz00);
+
+            }
+
+            /* Inner loop uses 67 flops (per j particle, as counted in the inc_nrnb call at the end) */
+        }
+
+        /* End of innermost loop: hand the accumulated i force to the helper together with the f and fshift destinations */
+
+        gmx_fjsp_update_iforce_1atom_swizzle_v2r8(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies in the energy-group slots selected by ggid = gid[iidx] */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations (this counts j particles, not two-wide SIMD passes) */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 9 flops (per i entry, as counted in the inc_nrnb call below) */
+    }
+
+    /* Increment number of outer iterations (one per neighborlist entry) */
+    outeriter        += nri;
+
+    /* Update outer/inner flops: report the estimated operation count under eNR_NBKERNEL_ELEC_VDW_VF */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_VF,outeriter*9 + inneriter*67);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwLJSh_GeomP1P1_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: Ewald (force blended from fr->ic->tabq_coul_F entries, masked at fr->rcoulomb; sh_ewald is loaded but unused here)
+ * VdW interaction:            LennardJones (force from c6/c12 and powers of rinvsq00, same cutoff mask; sh_vdw_invrcut6 is loaded but unused here)
+ * Geometry:                   Particle-Particle (one i particle per list entry, j particles processed two per SIMD register)
+ * Calculate force/pot:        Force (only ff and fr->fshift destinations are passed on; kernel_data and gid are not used in this variant)
+ */
+void
+nb_kernel_ElecEwSh_VdwLJSh_GeomP1P1_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters (this kernel only uses suffix 0).
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. the two different
+     * jnr indices whose data are put in the two positions of the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_F;
+    ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+    ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec (fr->rcoulomb); rcutoff2 is its square */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+    rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+
+    sh_vdw_invrcut6  = gmx_fjsp_set1_v2r8(fr->ic->sh_invrc6);
+    rvdw             = gmx_fjsp_set1_v2r8(fr->rvdw);
+
+    /* Only to silence uninitialized-variable warnings: each of these is assigned in the j loops before it is read */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists: one iteration per i particle entry, nri entries in total */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Offset of this entry's shift vector; the same offset indexes fshift (DIM reals per shift index) */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors: this entry's j particles are jjnr[j_index_start..j_index_end-1] */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer (i particle) index and its offset into the flat coordinate/force arrays x and f */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector; results are returned in ix0,iy0,iz0 */
+        gmx_fjsp_load_shift_and_1rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+
+        /* Load parameters for i particles: charge scaled by facel (fr->epsfac) and the i-type row offset into vdwparam */
+        iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_load1_v2r8(charge+inr+0));
+        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+        /* Start inner kernel loop: two j particles (A,B) per pass; a leftover odd j is handled after this loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor indices for both SIMD positions, and their coordinate offsets */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* Load coordinates of j atoms A and B into jx0,jy0,jz0 */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector between the i particle and each j particle */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it (rinv00 via invsqrt of rsq00, rinvsq00 = rinv00*rinv00) */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+
+            /* Load parameters for j particles: charges, and per-j offsets into the i-type row of vdwparam (2 reals per type) */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms; this block runs only if gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2) is nonzero */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            /* EWALD ELECTROSTATICS (force only): a pair of reals at each table index goes to ewtabF/ewtabFn and is blended with eweps */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer; eweps is the remainder, ewconv exposes both indices */
+            ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+
+            /* LENNARD-JONES DISPERSION/REPULSION (force only): fvdw combines c12_00*rinvsix with c6_00, then scales by rinvsix*rinvsq00 */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,c6_00),_fjsp_mul_v2r8(rinvsix,rinvsq00));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force: fscal was ANDed with cutoff_mask above; the helper below updates f at both j offsets from fscal and d */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fscal,dx00,dy00,dz00);
+
+            }
+
+            /* Inner loop uses 49 flops (per j particle, as counted in the inc_nrnb call at the end) */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* Leftover j particle (jidx stopped one short of j_index_end): load its coordinates through pointer A only */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector between the i particle and the single j particle */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it (rinv00 via invsqrt of rsq00, rinvsq00 = rinv00*rinv00) */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+
+            /* Load parameters for the single j particle (charge via _fjsp_loadl_v2r8 onto a zeroed register) */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms; this block runs only if gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2) is nonzero */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            /* EWALD ELECTROSTATICS (force only): as in the paired loop, but only table index ewconv.i[0] is read */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer; eweps is the remainder */
+            ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+
+            /* LENNARD-JONES DISPERSION/REPULSION (force only): fvdw combines c12_00*rinvsix with c6_00, then scales by rinvsix*rinvsq00 */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,c6_00),_fjsp_mul_v2r8(rinvsix,rinvsq00));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force; the unpacklo-with-zero step above presumably clears the unused second SIMD position of fscal */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fscal,dx00,dy00,dz00);
+
+            }
+
+            /* Inner loop uses 49 flops (per j particle, as counted in the inc_nrnb call at the end) */
+        }
+
+        /* End of innermost loop: hand the accumulated i force to the helper together with the f and fshift destinations */
+
+        gmx_fjsp_update_iforce_1atom_swizzle_v2r8(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations (this counts j particles, not two-wide SIMD passes) */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 7 flops (per i entry, as counted in the inc_nrnb call below) */
+    }
+
+    /* Increment number of outer iterations (one per neighborlist entry) */
+    outeriter        += nri;
+
+    /* Update outer/inner flops: report the estimated operation count under eNR_NBKERNEL_ELEC_VDW_F */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_F,outeriter*7 + inneriter*49);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_sparc64_hpc_ace_double.c b/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..e9d6af0
--- /dev/null
@@ -0,0 +1,1168 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: Ewald (tabulated via fr->ic->tabq_coul_FDV0, potential offset by sh_ewald, explicit cutoff mask)
+ * VdW interaction:            LennardJones (potential offset using fr->ic->sh_invrc6, explicit cutoff mask, first i atom only)
+ * Geometry:                   Water3-Particle (three i atoms per list entry against single j atoms, two j atoms per SIMD step)
+ * Calculate force/pot:        PotentialAndForce (forces into ff/fshift, energies into kernel_data energy-group arrays)
+ */
+void
+nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,        /* neighbor list: nri, iinr, jindex, jjnr, shift, gid are read */
+                     rvec * gmx_restrict                    xx,           /* coordinates, read through the flat pointer x */
+                     rvec * gmx_restrict                    ff,           /* forces, updated through the flat pointer f */
+                     t_forcerec * gmx_restrict              fr,           /* shift_vec, fshift, epsfac, ntype, nbfp, rcoulomb, rvdw and fr->ic are read */
+                     t_mdatoms * gmx_restrict               mdatoms,      /* chargeA and typeA arrays are read */
+                     nb_kernel_data_t * gmx_restrict        kernel_data,  /* energygrp_elec / energygrp_vdw receive the potential sums */
+                     t_nrnb * gmx_restrict                  nrnb)         /* flop accounting, updated once at the end via inc_nrnb */
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters (this kernel uses i atoms 0,1,2 and a single j atom 0).
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; /* tx,ty,tz,jidxall are not referenced below */
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1; /* not referenced below: only i atom 0 has VdW parameters in this kernel */
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2; /* not referenced below */
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv; /* only ewconv is used: table indices are stored as SIMD and read back as integers */
+
+    x                = xx[0]; /* flat real view of the coordinates, indexed with DIM*atom below */
+    f                = ff[0]; /* flat real view of the forces, indexed with DIM*atom below */
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac); /* electrostatic prefactor, multiplied into the i charges below */
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_FDV0; /* indexed with a stride of 4 reals per table point below */
+    ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+    ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters: charges and VdW type are taken from the first i entry only and reused for all list entries */
+    inr              = nlist->iinr[0];
+    iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0]; /* row offset into vdwparam; 2 reals (c6,c12) per type pair */
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+    rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+
+    sh_vdw_invrcut6  = gmx_fjsp_set1_v2r8(fr->ic->sh_invrc6);
+    rvdw             = gmx_fjsp_set1_v2r8(fr->rvdw); /* assigned but not referenced again in this function */
+
+    /* Pre-initialize the j indices and offsets to avoid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists (one i water per entry) */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+        vvdwsum          = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop: two j atoms (A and B) per iteration; an odd leftover is handled after the loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector from each of the three i atoms to the j atoms */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            fjx0             = _fjsp_setzero_v2r8(); /* j force accumulators, summed over the three i atoms */
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2)) /* pair i0-j0 (elec + VdW); guard presumably tests whether any SIMD lane is inside the cutoff */
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp)); /* remainder of ewrt after removing the integer index */
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp); /* spill the indices so they can be read as integers through ewconv.i[] */
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] ); /* first two table reals for lane A */
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] ); /* first two table reals for lane B */
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD); /* presumably regroups per-lane pairs into per-quantity vectors - macro not visible here */
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv00,sh_ewald),velec)); /* sh_ewald offsets the potential only */
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+
+            /* LENNARD-JONES DISPERSION/REPULSION (sh_vdw_invrcut6 enters the potential only, not the force) */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8(_fjsp_nmsub_v2r8(c12_00,_fjsp_mul_v2r8(sh_vdw_invrcut6,sh_vdw_invrcut6),vvdw12),one_twelfth,
+                                           _fjsp_mul_v2r8(_fjsp_nmsub_v2r8( c6_00,sh_vdw_invrcut6,vvdw6),one_sixth));
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2); /* per-lane mask, ANDed into energies and force below */
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+            vvdw             = _fjsp_and_v2r8(vvdw,cutoff_mask);
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0); /* same contribution collected for the j atoms; applied to f by the decrement helper below */
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2)) /* pair i1-j0: electrostatics only, no VdW terms are computed for this i atom */
+            {
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv10,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2)) /* pair i2-j0: electrostatics only */
+            {
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv20,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+
+            gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0); /* apply the collected j contributions to both j atoms */
+
+            /* Inner loop uses 168 flops */
+        }
+
+        if(jidx<j_index_end) /* odd number of j atoms: handle the last one alone (index A only) */
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector from each of the three i atoms to the j atom */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2)) /* pair i0-j0 (elec + VdW), single j atom */
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8(); /* no second j atom: zeros replace the table load for index B */
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv00,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+
+            /* LENNARD-JONES DISPERSION/REPULSION (sh_vdw_invrcut6 enters the potential only, not the force) */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8(_fjsp_nmsub_v2r8(c12_00,_fjsp_mul_v2r8(sh_vdw_invrcut6,sh_vdw_invrcut6),vvdw12),one_twelfth,
+                                           _fjsp_mul_v2r8(_fjsp_nmsub_v2r8( c6_00,sh_vdw_invrcut6,vvdw6),one_sixth));
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8()); /* presumably replaces the unused second lane with zero before accumulating */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+            vvdw             = _fjsp_and_v2r8(vvdw,cutoff_mask);
+            vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2)) /* pair i1-j0: electrostatics only, single j atom */
+            {
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv10,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2)) /* pair i2-j0: electrostatics only, single j atom */
+            {
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv20,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+
+            gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0); /* apply the collected j contributions to the single j atom */
+
+            /* Inner loop uses 168 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies in the energy-group slots selected by ggid */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations (counted per j atom, not per SIMD step) */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 20 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops: 20 per outer iteration plus 168 per j atom */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_VF,outeriter*20 + inneriter*168);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LennardJones
+ * Geometry:                   Water3-Particle
+ * Calculate force/pot:        Force
+ *
+ * Force-only variant: no potential energies are accumulated and kernel_data
+ * is never referenced in the body.
+ *
+ * For every i entry of the neighbor list the atoms inr, inr+1 and inr+2
+ * (suffixes 0,1,2) are interacted with single j atoms. The j atoms are
+ * processed two per SIMD iteration, followed by a one-atom epilogue when the
+ * number of j entries is odd. Atom 0 gets tabulated Ewald electrostatics plus
+ * Lennard-Jones; atoms 1 and 2 get tabulated Ewald electrostatics only.
+ * Every scalar force is ANDed with an (rsq < rcoulomb*rcoulomb) mask before
+ * it is accumulated.
+ *
+ * Arguments, as used by this function:
+ *   nlist       - neighbor list; nri, iinr, jindex, jjnr and shift are read
+ *                 (gid is fetched into a local but not used here).
+ *   xx          - atom coordinates, accessed through xx[0].
+ *   ff          - atom forces, updated through ff[0].
+ *   fr          - supplies epsfac, ntype, nbfp, shift_vec, fshift, rcoulomb and
+ *                 the Ewald force table ic->tabq_coul_F with ic->tabq_scale
+ *                 (rvdw, ic->sh_ewald and ic->sh_invrc6 are loaded but unused).
+ *   mdatoms     - supplies chargeA and typeA.
+ *   kernel_data - not referenced by this force-only variant.
+ *   nrnb        - flop counters, updated once at the end through inc_nrnb().
+ */
+void
+nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     *
+     * Several of the variables declared below (e.g. tx/ty/tz, jidxall, isai*, isaj0,
+     * velec, velecsum, crf/krf/krf2, the vvdw* and fvdw6/fvdw12 terms, ewtabD/ewtabV,
+     * dummy_mask, one, two, one_sixth, one_twelfth, vfconv, gbconv, ggid, vdwioffset1/2,
+     * vdwjidx0B in the epilogue) are never read in this function; presumably the kernel
+     * generator emits one common declaration set for all variants.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_F;
+    ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+    ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters.
+     * The three i charges (pre-multiplied by facel) and the VdW parameter offset
+     * are taken from the first i entry only and are reused unchanged for every
+     * i entry of the outer loop. Only atom 0 gets a VdW offset.
+     * NOTE(review): iinr[0] is read before the nri loop, so this presumably
+     * relies on callers never passing an empty list - confirm.
+     */
+    inr              = nlist->iinr[0];
+    iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice.
+     * rcutoff2 is the squared cutoff that every rsq is compared against.
+     * The two values loaded right after it (sh_vdw_invrcut6, rvdw) are not read
+     * again in this force-only kernel.
+     */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+    rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+
+    sh_vdw_invrcut6  = gmx_fjsp_set1_v2r8(fr->ic->sh_invrc6);
+    rvdw             = gmx_fjsp_set1_v2r8(fr->rvdw);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists (one iteration per i entry) */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list (offset in reals, DIM per shift index) */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors: j entries [j_index_start, j_index_end) */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector.
+         * The i force accumulators that follow are reset for every i entry.
+         */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop.
+         * Two j atoms (A and B) are handled per iteration, one per SIMD lane.
+         * The loop only runs while at least two j entries remain; a single
+         * leftover entry is handled by the block after the loop.
+         */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector (i minus j) for each of the three i atoms */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+
+            /* Load parameters for j particles: the two j charges, and twice the
+             * VdW type of each j atom, used below as an offset to a (c6,c12) pair.
+             * The j force accumulators are reset for every j pair.
+             */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            /* EWALD ELECTROSTATICS
+             * The table value is interpolated between two adjacent entries,
+             * F + eps*(Fn - F), and then felec = qq*rinv*(rinvsq - table value).
+             * This reading assumes madd(a,b,c) = a*b + c and nmsub(a,b,c) = c - a*b
+             * - TODO confirm against the HPC-ACE intrinsics documentation.
+             */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer.
+             * eweps is the remainder after subtracting the converted-back index; the
+             * integer lanes are stored through the ewconv union so they can be used
+             * as scalar offsets into ewtab.
+             */
+            ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+
+            /* LENNARD-JONES DISPERSION/REPULSION
+             * rinvsix = rinvsq^3 and fvdw = (c12*rinvsix - c6)*rinvsix*rinvsq,
+             * assuming msub(a,b,c) = a*b - c - TODO confirm.
+             */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,c6_00),_fjsp_mul_v2r8(rinvsix,rinvsq00));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force.
+             * fscal has just been ANDed with the cutoff mask. The same product
+             * d*fscal is added to both the i and the j accumulator; the j sum is
+             * handed to the decrement helper once all three interactions are done.
+             */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+            {
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS (no Lennard-Jones term for i atom 1) */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+            {
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS (no Lennard-Jones term for i atom 2) */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+
+            gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 136 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates.
+             * Remainder case: one j entry is left over after the paired loop, so
+             * everything in this block uses the single-pointer helper variants and
+             * only table index ewconv.i[0].
+             */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+
+            /* Load parameters for j particles.
+             * The single j charge is loaded on top of a zeroed register
+             * (presumably into the first lane, leaving the other lane zero).
+             */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            /* EWALD ELECTROSTATICS (same arithmetic as in the paired loop) */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,c6_00),_fjsp_mul_v2r8(rinvsix,rinvsq00));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force.
+             * The unpacklo with a zero register just above presumably clears the
+             * lane that has no j atom, so it adds nothing to the i forces
+             * - TODO confirm the lane semantics of the intrinsic.
+             */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+            {
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS (no Lennard-Jones term for i atom 1) */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+            {
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS (no Lennard-Jones term for i atom 2) */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+
+            gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 136 flops */
+        }
+
+        /* End of innermost loop.
+         * The nine i force accumulators are handed to the update helper together
+         * with this entry's positions in f and fshift (presumably it reduces the
+         * SIMD lanes and adds to both arrays - confirm in the kernelutil header).
+         */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations (counted in j entries, not SIMD iterations) */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 18 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops, using the per-iteration counts quoted in the loop comments */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_F,outeriter*18 + inneriter*136);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_sparc64_hpc_ace_double.c b/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..7f31d05
--- /dev/null
@@ -0,0 +1,2408 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LennardJones
+ * Geometry:                   Water3-Water3
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B;
+    _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B;
+    _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+    _fjsp_v2r8       dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+    ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq0              = gmx_fjsp_set1_v2r8(charge[inr+0]);
+    jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+    jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    qq00             = _fjsp_mul_v2r8(iq0,jq0);
+    c6_00            = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    qq01             = _fjsp_mul_v2r8(iq0,jq1);
+    qq02             = _fjsp_mul_v2r8(iq0,jq2);
+    qq10             = _fjsp_mul_v2r8(iq1,jq0);
+    qq11             = _fjsp_mul_v2r8(iq1,jq1);
+    qq12             = _fjsp_mul_v2r8(iq1,jq2);
+    qq20             = _fjsp_mul_v2r8(iq2,jq0);
+    qq21             = _fjsp_mul_v2r8(iq2,jq1);
+    qq22             = _fjsp_mul_v2r8(iq2,jq2);
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+    rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+
+    sh_vdw_invrcut6  = gmx_fjsp_set1_v2r8(fr->ic->sh_invrc6);
+    rvdw             = gmx_fjsp_set1_v2r8(fr->rvdw);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+        vvdwsum          = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx01             = _fjsp_sub_v2r8(ix0,jx1);
+            dy01             = _fjsp_sub_v2r8(iy0,jy1);
+            dz01             = _fjsp_sub_v2r8(iz0,jz1);
+            dx02             = _fjsp_sub_v2r8(ix0,jx2);
+            dy02             = _fjsp_sub_v2r8(iy0,jy2);
+            dz02             = _fjsp_sub_v2r8(iz0,jz2);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+            rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+            rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+            rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv00,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8(_fjsp_nmsub_v2r8(c12_00,_fjsp_mul_v2r8(sh_vdw_invrcut6,sh_vdw_invrcut6),vvdw12),one_twelfth,
+                                           _fjsp_mul_v2r8(_fjsp_nmsub_v2r8( c6_00,sh_vdw_invrcut6,vvdw6),one_sixth));
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+            vvdw             = _fjsp_and_v2r8(vvdw,cutoff_mask);
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq01,rcutoff2))
+            {
+
+            r01              = _fjsp_mul_v2r8(rsq01,rinv01);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r01,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq01,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv01,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq01,rinv01),_fjsp_sub_v2r8(rinvsq01,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq01,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+            
+            fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq02,rcutoff2))
+            {
+
+            r02              = _fjsp_mul_v2r8(rsq02,rinv02);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r02,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq02,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv02,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq02,rinv02),_fjsp_sub_v2r8(rinvsq02,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq02,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+            
+            fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+            {
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv10,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
+            {
+
+            r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv11,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
+            {
+
+            r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv12,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+            {
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv20,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
+            {
+
+            r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv21,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
+            {
+
+            r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv22,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            }
+
+            gmx_fjsp_decrement_3rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 459 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx01             = _fjsp_sub_v2r8(ix0,jx1);
+            dy01             = _fjsp_sub_v2r8(iy0,jy1);
+            dz01             = _fjsp_sub_v2r8(iz0,jz1);
+            dx02             = _fjsp_sub_v2r8(ix0,jx2);
+            dy02             = _fjsp_sub_v2r8(iy0,jy2);
+            dz02             = _fjsp_sub_v2r8(iz0,jz2);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+            rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+            rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+            rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv00,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8(_fjsp_nmsub_v2r8(c12_00,_fjsp_mul_v2r8(sh_vdw_invrcut6,sh_vdw_invrcut6),vvdw12),one_twelfth,
+                                           _fjsp_mul_v2r8(_fjsp_nmsub_v2r8( c6_00,sh_vdw_invrcut6,vvdw6),one_sixth));
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+            vvdw             = _fjsp_and_v2r8(vvdw,cutoff_mask);
+            vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq01,rcutoff2))
+            {
+
+            r01              = _fjsp_mul_v2r8(rsq01,rinv01);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r01,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq01,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv01,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq01,rinv01),_fjsp_sub_v2r8(rinvsq01,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq01,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+            
+            fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq02,rcutoff2))
+            {
+
+            r02              = _fjsp_mul_v2r8(rsq02,rinv02);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r02,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq02,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv02,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq02,rinv02),_fjsp_sub_v2r8(rinvsq02,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq02,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+            
+            fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+            {
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv10,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
+            {
+
+            r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv11,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
+            {
+
+            r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv12,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+            {
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv20,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
+            {
+
+            r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv21,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
+            {
+
+            r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv22,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            }
+
+            gmx_fjsp_decrement_3rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 459 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 20 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*459);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LennardJones
+ * Geometry:                   Water3-Water3
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B;
+    _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B;
+    _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+    _fjsp_v2r8       dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_F;
+    ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+    ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq0              = gmx_fjsp_set1_v2r8(charge[inr+0]);
+    jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+    jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    qq00             = _fjsp_mul_v2r8(iq0,jq0);
+    c6_00            = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    qq01             = _fjsp_mul_v2r8(iq0,jq1);
+    qq02             = _fjsp_mul_v2r8(iq0,jq2);
+    qq10             = _fjsp_mul_v2r8(iq1,jq0);
+    qq11             = _fjsp_mul_v2r8(iq1,jq1);
+    qq12             = _fjsp_mul_v2r8(iq1,jq2);
+    qq20             = _fjsp_mul_v2r8(iq2,jq0);
+    qq21             = _fjsp_mul_v2r8(iq2,jq1);
+    qq22             = _fjsp_mul_v2r8(iq2,jq2);
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+    rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+
+    sh_vdw_invrcut6  = gmx_fjsp_set1_v2r8(fr->ic->sh_invrc6);
+    rvdw             = gmx_fjsp_set1_v2r8(fr->rvdw);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx01             = _fjsp_sub_v2r8(ix0,jx1);
+            dy01             = _fjsp_sub_v2r8(iy0,jy1);
+            dz01             = _fjsp_sub_v2r8(iz0,jz1);
+            dx02             = _fjsp_sub_v2r8(ix0,jx2);
+            dy02             = _fjsp_sub_v2r8(iy0,jy2);
+            dz02             = _fjsp_sub_v2r8(iz0,jz2);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+            rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+            rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+            rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,c6_00),_fjsp_mul_v2r8(rinvsix,rinvsq00));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq01,rcutoff2))
+            {
+
+            r01              = _fjsp_mul_v2r8(rsq01,rinv01);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r01,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq01,rinv01),_fjsp_sub_v2r8(rinvsq01,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq01,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+            
+            fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq02,rcutoff2))
+            {
+
+            r02              = _fjsp_mul_v2r8(rsq02,rinv02);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r02,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq02,rinv02),_fjsp_sub_v2r8(rinvsq02,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq02,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+            
+            fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+            {
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
+            {
+
+            r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
+            {
+
+            r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+            {
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
+            {
+
+            r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
+            {
+
+            r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            }
+
+            gmx_fjsp_decrement_3rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 385 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx01             = _fjsp_sub_v2r8(ix0,jx1);
+            dy01             = _fjsp_sub_v2r8(iy0,jy1);
+            dz01             = _fjsp_sub_v2r8(iz0,jz1);
+            dx02             = _fjsp_sub_v2r8(ix0,jx2);
+            dy02             = _fjsp_sub_v2r8(iy0,jy2);
+            dz02             = _fjsp_sub_v2r8(iz0,jz2);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+            rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+            rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+            rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,c6_00),_fjsp_mul_v2r8(rinvsix,rinvsq00));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq01,rcutoff2))
+            {
+
+            r01              = _fjsp_mul_v2r8(rsq01,rinv01);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r01,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq01,rinv01),_fjsp_sub_v2r8(rinvsq01,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq01,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+            
+            fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq02,rcutoff2))
+            {
+
+            r02              = _fjsp_mul_v2r8(rsq02,rinv02);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r02,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq02,rinv02),_fjsp_sub_v2r8(rinvsq02,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq02,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+            
+            fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+            {
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
+            {
+
+            r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
+            {
+
+            r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+            {
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
+            {
+
+            r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
+            {
+
+            r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            }
+
+            gmx_fjsp_decrement_3rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 385 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 18 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*385);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_sparc64_hpc_ace_double.c b/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..0e61a06
--- /dev/null
@@ -0,0 +1,1312 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LennardJones
+ * Geometry:                   Water4-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+    ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+    rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+
+    sh_vdw_invrcut6  = gmx_fjsp_set1_v2r8(fr->ic->sh_invrc6);
+    rvdw             = gmx_fjsp_set1_v2r8(fr->rvdw);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_4rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+        fix3             = _fjsp_setzero_v2r8();
+        fiy3             = _fjsp_setzero_v2r8();
+        fiz3             = _fjsp_setzero_v2r8();
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+        vvdwsum          = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx30             = _fjsp_sub_v2r8(ix3,jx0);
+            dy30             = _fjsp_sub_v2r8(iy3,jy0);
+            dz30             = _fjsp_sub_v2r8(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+
+            rinvsq00         = gmx_fjsp_inv_v2r8(rsq00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8(_fjsp_nmsub_v2r8(c12_00,_fjsp_mul_v2r8(sh_vdw_invrcut6,sh_vdw_invrcut6),vvdw12),one_twelfth,
+                                           _fjsp_mul_v2r8(_fjsp_nmsub_v2r8( c6_00,sh_vdw_invrcut6,vvdw6),one_sixth));
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdw             = _fjsp_and_v2r8(vvdw,cutoff_mask);
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+            {
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv10,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+            {
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv20,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq30,rcutoff2))
+            {
+
+            r30              = _fjsp_mul_v2r8(rsq30,rinv30);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _fjsp_mul_v2r8(iq3,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r30,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq30,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv30,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq30,rinv30),_fjsp_sub_v2r8(rinvsq30,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq30,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+            
+            fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+
+            }
+
+            gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 194 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx30             = _fjsp_sub_v2r8(ix3,jx0);
+            dy30             = _fjsp_sub_v2r8(iy3,jy0);
+            dz30             = _fjsp_sub_v2r8(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+
+            rinvsq00         = gmx_fjsp_inv_v2r8(rsq00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8(_fjsp_nmsub_v2r8(c12_00,_fjsp_mul_v2r8(sh_vdw_invrcut6,sh_vdw_invrcut6),vvdw12),one_twelfth,
+                                           _fjsp_mul_v2r8(_fjsp_nmsub_v2r8( c6_00,sh_vdw_invrcut6,vvdw6),one_sixth));
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdw             = _fjsp_and_v2r8(vvdw,cutoff_mask);
+            vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+            {
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv10,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+            {
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv20,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq30,rcutoff2))
+            {
+
+            r30              = _fjsp_mul_v2r8(rsq30,rinv30);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _fjsp_mul_v2r8(iq3,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r30,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq30,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv30,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq30,rinv30),_fjsp_sub_v2r8(rinvsq30,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq30,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+            
+            fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+
+            }
+
+            gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 194 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_4atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 26 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_VF,outeriter*26 + inneriter*194);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LennardJones
+ * Geometry:                   Water4-Particle
+ * Calculate force/pot:        Force
+ *
+ * Force-only kernel. For every i entry of the neighborlist it walks the j
+ * neighbors two at a time (one per SIMD lane), masks each pair against the
+ * squared cutoff derived from fr->rcoulomb, and accumulates forces for both
+ * the i atoms and the j atoms. No potential energy is computed or stored in
+ * this function.
+ *
+ * Per i/j pair the following is evaluated:
+ *   - i atom 0 with j: Lennard-Jones only, c6/c12 read from fr->nbfp.
+ *   - i atoms 1,2,3 with j: Ewald electrostatics only, using two consecutive
+ *     entries of the force table fr->ic->tabq_coul_F per lane.
+ *
+ * Parameters:
+ *   nlist       - neighborlist; nri, iinr, jindex, jjnr, shift and gid are fetched.
+ *   xx          - coordinates, accessed as a flat real array through xx[0].
+ *   ff          - forces, accessed as a flat real array through ff[0]; passed to
+ *                 the force update helpers.
+ *   fr          - force record; supplies shift_vec, fshift, epsfac, ntype, nbfp,
+ *                 rcoulomb, rvdw and the Ewald data in fr->ic.
+ *   mdatoms     - supplies chargeA and typeA.
+ *   kernel_data - not referenced anywhere in this force-only kernel.
+ *   nrnb        - flop accounting; inc_nrnb() is called once at the end.
+ */
+void
+nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters. A two-digit suffix names an i/j pair, e.g. dx10 is
+     * built from i atom 1 and j atom 0.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. the two
+     * different jnr indices whose data is put in the two positions of the SIMD register.
+     *
+     * NOTE(review): this is generated code and many of the locals declared below
+     * (e.g. tx, ty, tz, jidxall, iq0, isai*, vdwioffset1-3, krf, vvdw*, one, two,
+     * vfconv, gbconv, dummy_mask) are never used in this particular kernel.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv; /* ewconv: SIMD table indices are stored via .simd and read back as integers via .i[] */
+
+    x                = xx[0]; /* flat real view of the coordinates */
+    f                = ff[0]; /* flat real view of the forces */
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid; /* fetched but not used further in this kernel */
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald); /* NOTE(review): assigned but never read in this kernel */
+    ewtab            = fr->ic->tabq_coul_F;
+    ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+    ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale); /* NOTE(review): assigned but never read in this kernel */
+
+    /* Setup water-specific parameters.
+     * These are taken once from the first i entry (nlist->iinr[0]) and are not
+     * recomputed inside the outer loop, so every i entry in the list is treated
+     * with the same charges and VdW type offset.
+     * Only i atoms 1-3 get a charge (pre-multiplied by facel) and only i atom 0
+     * gets a VdW parameter offset.
+     */
+    inr              = nlist->iinr[0];
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice.
+     * rcutoff2 is the squared cutoff that all rsq values below are compared against.
+     */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+    rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+
+    sh_vdw_invrcut6  = gmx_fjsp_set1_v2r8(fr->ic->sh_invrc6); /* NOTE(review): assigned but never read in this kernel */
+    rvdw             = gmx_fjsp_set1_v2r8(fr->rvdw); /* NOTE(review): assigned but never read in this kernel */
+
+    /* Give the j indices/offsets a defined value up front to silence compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors: j entries [j_index_start, j_index_end) */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector (four i atoms starting at i_coord_offset) */
+        gmx_fjsp_load_shift_and_4rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix0             = _fjsp_setzero_v2r8(); /* i force accumulators start at zero for every i entry */
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+        fix3             = _fjsp_setzero_v2r8();
+        fiy3             = _fjsp_setzero_v2r8();
+        fiz3             = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop.
+         * Two j atoms (A and B) are handled per iteration, one per SIMD lane. The
+         * bound j_index_end-1 leaves a possible odd last neighbor for the single-j
+         * block that follows the loop.
+         * Each of the four interaction blocks is guarded by gmx_fjsp_any_lt_v2r8
+         * (judging by its name: true when at least one lane has rsq below
+         * rcutoff2), and inside a block fscal is and-ed with cutoff_mask so that
+         * lanes failing the rsq < rcutoff2 comparison contribute nothing.
+         * The same dx*fscal product is added to the i accumulators (fix/fiy/fiz)
+         * and to the j accumulators (fjx0/fjy0/fjz0).
+         */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector (i minus j) for each of the four i atoms */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx30             = _fjsp_sub_v2r8(ix3,jx0);
+            dy30             = _fjsp_sub_v2r8(iy3,jy0);
+            dz30             = _fjsp_sub_v2r8(iz3,jz0);
+
+            /* Calculate squared distance and things based on it.
+             * Pair 00 (LJ only) needs just 1/r^2; pairs 10,20,30 (electrostatics)
+             * need 1/r as well, so their 1/r^2 is formed as rinv*rinv.
+             */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+
+            rinvsq00         = gmx_fjsp_inv_v2r8(rsq00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+
+            /* Load parameters for j particles: charges and VdW type offsets of j atoms A and B */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            fjx0             = _fjsp_setzero_v2r8(); /* j force accumulators are reset for every j pair */
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************
+             * i atom 0 with j atom 0: Lennard-Jones only.
+             */
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION
+             * rinvsix is (1/r^2)^3. Assuming _fjsp_msub_v2r8(a,b,c) computes a*b-c
+             * (TODO confirm against the intrinsic documentation), the scalar force is
+             * fvdw = (c12*rinvsix - c6) * rinvsix * rinvsq00.
+             */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,c6_00),_fjsp_mul_v2r8(rinvsix,rinvsq00));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            fscal            = fvdw;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************
+             * i atom 1 with j atom 0: Ewald electrostatics only.
+             */
+
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+            {
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer.
+             * eweps is what remains of ewrt after subtracting the converted-back index;
+             * the index register is stored into ewconv so that each lane's index can be
+             * read as an integer (ewconv.i[0], ewconv.i[1]).
+             */
+            ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            /* NOTE(review): assuming the 2pair load returns, per lane, the table entry
+             * at the index (ewtabF) and the next one (ewtabFn), and that
+             * _fjsp_nmsub_v2r8(a,b,c) computes c-a*b, the first felec below is the linear
+             * interpolation (1-eweps)*ewtabF + eweps*ewtabFn - confirm.
+             */
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************
+             * i atom 2 with j atom 0: Ewald electrostatics only (same steps as pair 10).
+             */
+
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+            {
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************
+             * i atom 3 with j atom 0: Ewald electrostatics only (same steps as pair 10).
+             */
+
+            if (gmx_fjsp_any_lt_v2r8(rsq30,rcutoff2))
+            {
+
+            r30              = _fjsp_mul_v2r8(rsq30,rinv30);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _fjsp_mul_v2r8(iq3,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r30,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq30,rinv30),_fjsp_sub_v2r8(rinvsq30,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq30,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+            
+            fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+
+            }
+
+            gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0); /* hand the summed j forces for atoms A and B to the helper */
+
+            /* Inner loop uses 162 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+            /* Remainder: the paired loop above stopped with one j entry left over.
+             * Only jnrA is loaded here, the 1ptr/1pair helper variants replace the
+             * 2ptr/2pair ones, only ewconv.i[0] is used as a table index, and fscal is
+             * additionally passed through _fjsp_unpacklo_v2r8(fscal, zero).
+             * NOTE(review): presumably that unpack keeps the first lane and zeroes the
+             * second, unused lane so it cannot contribute to the i forces - confirm
+             * against the intrinsic documentation.
+             */
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector (i minus j) for each of the four i atoms */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx30             = _fjsp_sub_v2r8(ix3,jx0);
+            dy30             = _fjsp_sub_v2r8(iy3,jy0);
+            dz30             = _fjsp_sub_v2r8(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+
+            rinvsq00         = gmx_fjsp_inv_v2r8(rsq00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+
+            /* Load parameters for j particles: single charge combined with a zero register */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************
+             * i atom 0 with the single remaining j atom: Lennard-Jones only.
+             */
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION (same expression as in the paired loop) */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,c6_00),_fjsp_mul_v2r8(rinvsix,rinvsq00));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            fscal            = fvdw;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************
+             * i atom 1 with the single remaining j atom: Ewald electrostatics only.
+             */
+
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+            {
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************
+             * i atom 2 with the single remaining j atom: Ewald electrostatics only.
+             */
+
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+            {
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************
+             * i atom 3 with the single remaining j atom: Ewald electrostatics only.
+             */
+
+            if (gmx_fjsp_any_lt_v2r8(rsq30,rcutoff2))
+            {
+
+            r30              = _fjsp_mul_v2r8(rsq30,rinv30);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _fjsp_mul_v2r8(iq3,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r30,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq30,rinv30),_fjsp_sub_v2r8(rinvsq30,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq30,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+            
+            fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+
+            }
+
+            gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0); /* hand the summed j force for the single atom to the helper */
+
+            /* Inner loop uses 162 flops */
+        }
+
+        /* End of innermost loop.
+         * Pass the accumulated forces of the four i atoms to the update helper,
+         * together with the force location of this i entry and its shift-force slot.
+         */
+
+        gmx_fjsp_update_iforce_4atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations (counted per j entry, including the odd remainder) */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 24 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops: 24 per i entry plus 162 per j entry, matching the counts quoted above */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_F,outeriter*24 + inneriter*162);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEwSh_VdwLJSh_GeomW4W4_sparc64_hpc_ace_double.c b/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEwSh_VdwLJSh_GeomW4W4_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..54aa1f9
--- /dev/null
@@ -0,0 +1,2564 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwLJSh_GeomW4W4_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: Ewald (potential shifted, with cutoff)
+ * VdW interaction:            LennardJones (potential shifted, with cutoff)
+ * Geometry:                   Water4-Water4
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecEwSh_VdwLJSh_GeomW4W4_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. to the two different
+     * jnr indices (jnrA,jnrB) whose data is put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B;
+    _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B;
+    _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    int              vdwjidx3A,vdwjidx3B;
+    _fjsp_v2r8       jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    _fjsp_v2r8       dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+    _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    _fjsp_v2r8       dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+    _fjsp_v2r8       dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+    _fjsp_v2r8       dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+    _fjsp_v2r8       dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+    ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+    jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+    jq3              = gmx_fjsp_set1_v2r8(charge[inr+3]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    c6_00            = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    qq11             = _fjsp_mul_v2r8(iq1,jq1);
+    qq12             = _fjsp_mul_v2r8(iq1,jq2);
+    qq13             = _fjsp_mul_v2r8(iq1,jq3);
+    qq21             = _fjsp_mul_v2r8(iq2,jq1);
+    qq22             = _fjsp_mul_v2r8(iq2,jq2);
+    qq23             = _fjsp_mul_v2r8(iq2,jq3);
+    qq31             = _fjsp_mul_v2r8(iq3,jq1);
+    qq32             = _fjsp_mul_v2r8(iq3,jq2);
+    qq33             = _fjsp_mul_v2r8(iq3,jq3);
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+    rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+
+    sh_vdw_invrcut6  = gmx_fjsp_set1_v2r8(fr->ic->sh_invrc6);
+    rvdw             = gmx_fjsp_set1_v2r8(fr->rvdw);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_4rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+        fix3             = _fjsp_setzero_v2r8();
+        fiy3             = _fjsp_setzero_v2r8();
+        fiz3             = _fjsp_setzero_v2r8();
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+        vvdwsum          = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_4rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx13             = _fjsp_sub_v2r8(ix1,jx3);
+            dy13             = _fjsp_sub_v2r8(iy1,jy3);
+            dz13             = _fjsp_sub_v2r8(iz1,jz3);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+            dx23             = _fjsp_sub_v2r8(ix2,jx3);
+            dy23             = _fjsp_sub_v2r8(iy2,jy3);
+            dz23             = _fjsp_sub_v2r8(iz2,jz3);
+            dx31             = _fjsp_sub_v2r8(ix3,jx1);
+            dy31             = _fjsp_sub_v2r8(iy3,jy1);
+            dz31             = _fjsp_sub_v2r8(iz3,jz1);
+            dx32             = _fjsp_sub_v2r8(ix3,jx2);
+            dy32             = _fjsp_sub_v2r8(iy3,jy2);
+            dz32             = _fjsp_sub_v2r8(iz3,jz2);
+            dx33             = _fjsp_sub_v2r8(ix3,jx3);
+            dy33             = _fjsp_sub_v2r8(iy3,jy3);
+            dz33             = _fjsp_sub_v2r8(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+            rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+            rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+            rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+            rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+            rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+            rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+            rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+            rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+
+            rinvsq00         = gmx_fjsp_inv_v2r8(rsq00);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+            rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+            rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+            rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+            rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+            fjx3             = _fjsp_setzero_v2r8();
+            fjy3             = _fjsp_setzero_v2r8();
+            fjz3             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8(_fjsp_nmsub_v2r8(c12_00,_fjsp_mul_v2r8(sh_vdw_invrcut6,sh_vdw_invrcut6),vvdw12),one_twelfth,
+                                           _fjsp_mul_v2r8(_fjsp_nmsub_v2r8( c6_00,sh_vdw_invrcut6,vvdw6),one_sixth));
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdw             = _fjsp_and_v2r8(vvdw,cutoff_mask);
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
+            {
+
+            r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv11,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
+            {
+
+            r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv12,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq13,rcutoff2))
+            {
+
+            r13              = _fjsp_mul_v2r8(rsq13,rinv13);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r13,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq13,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv13,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq13,rinv13),_fjsp_sub_v2r8(rinvsq13,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq13,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+            
+            fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
+            {
+
+            r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv21,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
+            {
+
+            r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv22,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq23,rcutoff2))
+            {
+
+            r23              = _fjsp_mul_v2r8(rsq23,rinv23);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r23,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq23,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv23,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq23,rinv23),_fjsp_sub_v2r8(rinvsq23,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq23,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+            
+            fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq31,rcutoff2))
+            {
+
+            r31              = _fjsp_mul_v2r8(rsq31,rinv31);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r31,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq31,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv31,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq31,rinv31),_fjsp_sub_v2r8(rinvsq31,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq31,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+            
+            fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq32,rcutoff2))
+            {
+
+            r32              = _fjsp_mul_v2r8(rsq32,rinv32);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r32,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq32,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv32,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq32,rinv32),_fjsp_sub_v2r8(rinvsq32,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq32,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+            
+            fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq33,rcutoff2))
+            {
+
+            r33              = _fjsp_mul_v2r8(rsq33,rinv33);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r33,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq33,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv33,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq33,rinv33),_fjsp_sub_v2r8(rinvsq33,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq33,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+            
+            fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+
+            }
+
+            gmx_fjsp_decrement_4rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 488 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_4rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx13             = _fjsp_sub_v2r8(ix1,jx3);
+            dy13             = _fjsp_sub_v2r8(iy1,jy3);
+            dz13             = _fjsp_sub_v2r8(iz1,jz3);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+            dx23             = _fjsp_sub_v2r8(ix2,jx3);
+            dy23             = _fjsp_sub_v2r8(iy2,jy3);
+            dz23             = _fjsp_sub_v2r8(iz2,jz3);
+            dx31             = _fjsp_sub_v2r8(ix3,jx1);
+            dy31             = _fjsp_sub_v2r8(iy3,jy1);
+            dz31             = _fjsp_sub_v2r8(iz3,jz1);
+            dx32             = _fjsp_sub_v2r8(ix3,jx2);
+            dy32             = _fjsp_sub_v2r8(iy3,jy2);
+            dz32             = _fjsp_sub_v2r8(iz3,jz2);
+            dx33             = _fjsp_sub_v2r8(ix3,jx3);
+            dy33             = _fjsp_sub_v2r8(iy3,jy3);
+            dz33             = _fjsp_sub_v2r8(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+            rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+            rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+            rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+            rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+            rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+            rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+            rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+            rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+
+            rinvsq00         = gmx_fjsp_inv_v2r8(rsq00);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+            rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+            rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+            rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+            rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+            fjx3             = _fjsp_setzero_v2r8();
+            fjy3             = _fjsp_setzero_v2r8();
+            fjz3             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8(_fjsp_nmsub_v2r8(c12_00,_fjsp_mul_v2r8(sh_vdw_invrcut6,sh_vdw_invrcut6),vvdw12),one_twelfth,
+                                           _fjsp_mul_v2r8(_fjsp_nmsub_v2r8( c6_00,sh_vdw_invrcut6,vvdw6),one_sixth));
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdw             = _fjsp_and_v2r8(vvdw,cutoff_mask);
+            vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
+            {
+
+            r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv11,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
+            {
+
+            r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv12,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq13,rcutoff2))
+            {
+
+            r13              = _fjsp_mul_v2r8(rsq13,rinv13);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r13,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq13,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv13,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq13,rinv13),_fjsp_sub_v2r8(rinvsq13,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq13,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+            
+            fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
+            {
+
+            r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv21,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
+            {
+
+            r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv22,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq23,rcutoff2))
+            {
+
+            r23              = _fjsp_mul_v2r8(rsq23,rinv23);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r23,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq23,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv23,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq23,rinv23),_fjsp_sub_v2r8(rinvsq23,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq23,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+            
+            fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq31,rcutoff2))
+            {
+
+            r31              = _fjsp_mul_v2r8(rsq31,rinv31);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r31,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq31,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv31,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq31,rinv31),_fjsp_sub_v2r8(rinvsq31,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq31,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+            
+            fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq32,rcutoff2))
+            {
+
+            r32              = _fjsp_mul_v2r8(rsq32,rinv32);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r32,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq32,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv32,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq32,rinv32),_fjsp_sub_v2r8(rinvsq32,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq32,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+            
+            fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq33,rcutoff2))
+            {
+
+            r33              = _fjsp_mul_v2r8(rsq33,rinv33);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r33,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq33,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv33,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq33,rinv33),_fjsp_sub_v2r8(rinvsq33,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq33,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+            
+            fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+
+            }
+
+            gmx_fjsp_decrement_4rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 488 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_4atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 26 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*488);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwLJSh_GeomW4W4_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LennardJones
+ * Geometry:                   Water4-Water4
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecEwSh_VdwLJSh_GeomW4W4_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B;
+    _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B;
+    _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    int              vdwjidx3A,vdwjidx3B;
+    _fjsp_v2r8       jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    _fjsp_v2r8       dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+    _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    _fjsp_v2r8       dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+    _fjsp_v2r8       dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+    _fjsp_v2r8       dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+    _fjsp_v2r8       dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_F;
+    ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+    ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+    jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+    jq3              = gmx_fjsp_set1_v2r8(charge[inr+3]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    c6_00            = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    qq11             = _fjsp_mul_v2r8(iq1,jq1);
+    qq12             = _fjsp_mul_v2r8(iq1,jq2);
+    qq13             = _fjsp_mul_v2r8(iq1,jq3);
+    qq21             = _fjsp_mul_v2r8(iq2,jq1);
+    qq22             = _fjsp_mul_v2r8(iq2,jq2);
+    qq23             = _fjsp_mul_v2r8(iq2,jq3);
+    qq31             = _fjsp_mul_v2r8(iq3,jq1);
+    qq32             = _fjsp_mul_v2r8(iq3,jq2);
+    qq33             = _fjsp_mul_v2r8(iq3,jq3);
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+    rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+
+    sh_vdw_invrcut6  = gmx_fjsp_set1_v2r8(fr->ic->sh_invrc6);
+    rvdw             = gmx_fjsp_set1_v2r8(fr->rvdw);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_4rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+        fix3             = _fjsp_setzero_v2r8();
+        fiy3             = _fjsp_setzero_v2r8();
+        fiz3             = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_4rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx13             = _fjsp_sub_v2r8(ix1,jx3);
+            dy13             = _fjsp_sub_v2r8(iy1,jy3);
+            dz13             = _fjsp_sub_v2r8(iz1,jz3);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+            dx23             = _fjsp_sub_v2r8(ix2,jx3);
+            dy23             = _fjsp_sub_v2r8(iy2,jy3);
+            dz23             = _fjsp_sub_v2r8(iz2,jz3);
+            dx31             = _fjsp_sub_v2r8(ix3,jx1);
+            dy31             = _fjsp_sub_v2r8(iy3,jy1);
+            dz31             = _fjsp_sub_v2r8(iz3,jz1);
+            dx32             = _fjsp_sub_v2r8(ix3,jx2);
+            dy32             = _fjsp_sub_v2r8(iy3,jy2);
+            dz32             = _fjsp_sub_v2r8(iz3,jz2);
+            dx33             = _fjsp_sub_v2r8(ix3,jx3);
+            dy33             = _fjsp_sub_v2r8(iy3,jy3);
+            dz33             = _fjsp_sub_v2r8(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+            rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+            rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+            rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+            rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+            rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+            rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+            rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+            rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+
+            rinvsq00         = gmx_fjsp_inv_v2r8(rsq00);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+            rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+            rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+            rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+            rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+            fjx3             = _fjsp_setzero_v2r8();
+            fjy3             = _fjsp_setzero_v2r8();
+            fjz3             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,c6_00),_fjsp_mul_v2r8(rinvsix,rinvsq00));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            fscal            = fvdw;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
+            {
+
+            r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
+            {
+
+            r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq13,rcutoff2))
+            {
+
+            r13              = _fjsp_mul_v2r8(rsq13,rinv13);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r13,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq13,rinv13),_fjsp_sub_v2r8(rinvsq13,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq13,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+            
+            fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
+            {
+
+            r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
+            {
+
+            r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq23,rcutoff2))
+            {
+
+            r23              = _fjsp_mul_v2r8(rsq23,rinv23);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r23,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq23,rinv23),_fjsp_sub_v2r8(rinvsq23,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq23,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+            
+            fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq31,rcutoff2))
+            {
+
+            r31              = _fjsp_mul_v2r8(rsq31,rinv31);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r31,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq31,rinv31),_fjsp_sub_v2r8(rinvsq31,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq31,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+            
+            fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq32,rcutoff2))
+            {
+
+            r32              = _fjsp_mul_v2r8(rsq32,rinv32);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r32,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq32,rinv32),_fjsp_sub_v2r8(rinvsq32,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq32,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+            
+            fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq33,rcutoff2))
+            {
+
+            r33              = _fjsp_mul_v2r8(rsq33,rinv33);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r33,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq33,rinv33),_fjsp_sub_v2r8(rinvsq33,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq33,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+            
+            fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+
+            }
+
+            gmx_fjsp_decrement_4rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 414 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_4rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx13             = _fjsp_sub_v2r8(ix1,jx3);
+            dy13             = _fjsp_sub_v2r8(iy1,jy3);
+            dz13             = _fjsp_sub_v2r8(iz1,jz3);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+            dx23             = _fjsp_sub_v2r8(ix2,jx3);
+            dy23             = _fjsp_sub_v2r8(iy2,jy3);
+            dz23             = _fjsp_sub_v2r8(iz2,jz3);
+            dx31             = _fjsp_sub_v2r8(ix3,jx1);
+            dy31             = _fjsp_sub_v2r8(iy3,jy1);
+            dz31             = _fjsp_sub_v2r8(iz3,jz1);
+            dx32             = _fjsp_sub_v2r8(ix3,jx2);
+            dy32             = _fjsp_sub_v2r8(iy3,jy2);
+            dz32             = _fjsp_sub_v2r8(iz3,jz2);
+            dx33             = _fjsp_sub_v2r8(ix3,jx3);
+            dy33             = _fjsp_sub_v2r8(iy3,jy3);
+            dz33             = _fjsp_sub_v2r8(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+            rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+            rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+            rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+            rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+            rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+            rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+            rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+            rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+
+            rinvsq00         = gmx_fjsp_inv_v2r8(rsq00);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+            rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+            rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+            rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+            rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+            fjx3             = _fjsp_setzero_v2r8();
+            fjy3             = _fjsp_setzero_v2r8();
+            fjz3             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,c6_00),_fjsp_mul_v2r8(rinvsix,rinvsq00));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            fscal            = fvdw;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
+            {
+
+            r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
+            {
+
+            r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq13,rcutoff2))
+            {
+
+            r13              = _fjsp_mul_v2r8(rsq13,rinv13);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r13,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq13,rinv13),_fjsp_sub_v2r8(rinvsq13,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq13,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+            
+            fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
+            {
+
+            r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
+            {
+
+            r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq23,rcutoff2))
+            {
+
+            r23              = _fjsp_mul_v2r8(rsq23,rinv23);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r23,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq23,rinv23),_fjsp_sub_v2r8(rinvsq23,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq23,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+            
+            fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq31,rcutoff2))
+            {
+
+            r31              = _fjsp_mul_v2r8(rsq31,rinv31);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r31,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq31,rinv31),_fjsp_sub_v2r8(rinvsq31,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq31,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+            
+            fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq32,rcutoff2))
+            {
+
+            r32              = _fjsp_mul_v2r8(rsq32,rinv32);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r32,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq32,rinv32),_fjsp_sub_v2r8(rinvsq32,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq32,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+            
+            fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq33,rcutoff2))
+            {
+
+            r33              = _fjsp_mul_v2r8(rsq33,rinv33);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r33,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq33,rinv33),_fjsp_sub_v2r8(rinvsq33,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq33,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+            
+            fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+
+            }
+
+            gmx_fjsp_decrement_4rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 414 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_4atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 24 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*414);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEwSh_VdwNone_GeomP1P1_sparc64_hpc_ace_double.c b/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEwSh_VdwNone_GeomP1P1_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..29c0330
--- /dev/null
@@ -0,0 +1,599 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwNone_GeomP1P1_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            None
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecEwSh_VdwNone_GeomP1P1_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+
+    sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+    ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+    rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_1rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+
+        /* Load parameters for i particles */
+        iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_load1_v2r8(charge+inr+0));
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv00,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fscal,dx00,dy00,dz00);
+
+            }
+
+            /* Inner loop uses 49 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv00,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fscal,dx00,dy00,dz00);
+
+            }
+
+            /* Inner loop uses 49 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_1atom_swizzle_v2r8(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 8 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VF,outeriter*8 + inneriter*49);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwNone_GeomP1P1_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: Ewald (force read from the fr->ic->tabq_coul_F table; lanes with rsq not below rcoulomb^2 are masked)
+ * VdW interaction:            None
+ * Geometry:                   Particle-Particle (one i atom per neighborlist entry, j atoms taken two at a time)
+ * Calculate force/pot:        Force (no potential energy is accumulated or written by this kernel)
+ */
+void
+nb_kernel_ElecEwSh_VdwNone_GeomP1P1_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; /* tx,ty,tz,jidxall are declared but not referenced below */
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv; /* only ewconv is used below: a SIMD value is stored to .simd and read back as the two integers .i[0], .i[1] */
+
+    x                = xx[0]; /* real* to the first coordinate; addressed below as x + DIM*atom */
+    f                = ff[0]; /* real* to the first force component; addressed below as f + DIM*atom */
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid; /* fetched here but not read again in this force-only kernel */
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+
+    sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald); /* set up here but not read again in this force-only kernel */
+    ewtab            = fr->ic->tabq_coul_F;
+    ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+    ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale); /* set up here but not read again in this force-only kernel */
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+    rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff); /* squared cutoff; every cutoff test below compares rsq00 against this */
+
+    /* Give the j indices and offsets defined values up front so compilers do not warn about possibly uninitialized use */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists: one i atom (iinr[iidx]) and one shift index per entry */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors: j entries are jjnr[j_index_start .. j_index_end-1] */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_1rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+
+        /* Load parameters for i particles: the i charge is pre-multiplied by facel (fr->epsfac) */
+        iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_load1_v2r8(charge+inr+0));
+
+        /* Start inner kernel loop: two j atoms (A and B) per iteration; an odd leftover is handled after the loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector (i minus j) */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2)) /* NOTE(review): by its name, true when any lane has rsq00 below rcutoff2 - confirm in the kernelutil header */
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp)); /* remainder of ewrt after subtracting the converted-back index */
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp); /* spill the index pair so each lane can be read as ewconv.i[] */
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn); /* NOTE(review): presumably the table entry at the index and the next one, per lane - confirm */
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF)); /* NOTE(review): looks like (1-eweps) x ewtabF + eweps x ewtabFn, assuming nmsub(a,b,c) is c - a x b - confirm */
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask); /* AND with the rsq00 < rcutoff2 compare result, per lane */
+
+            /* Update vectorial force: accumulate fscal times the displacement for the i atom */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fscal,dx00,dy00,dz00);
+
+            }
+
+            /* Inner loop uses 42 flops */
+        }
+
+        if(jidx<j_index_end) /* the loop above stops at j_index_end-1, so this is true when exactly one unpaired j atom remains */
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates (single j atom; only the A index is used in this block) */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector (i minus j) */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+
+            /* Load parameters for j particles: only the charge of jnrA, combined with a zeroed register */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2)) /* NOTE(review): tests the whole register although only one j atom was loaded - confirm the unused lane is harmless */
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp)); /* remainder of ewrt after subtracting the converted-back index */
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp); /* spill the index pair; only ewconv.i[0] is read in this block */
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF)); /* same expression as in the paired loop above */
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask); /* AND with the rsq00 < rcutoff2 compare result, per lane */
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8()); /* NOTE(review): presumably keeps the A lane and zeroes the other so the unused lane adds no force - confirm */
+
+            /* Update vectorial force: accumulate fscal times the displacement for the i atom */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fscal,dx00,dy00,dz00);
+
+            }
+
+            /* Inner loop uses 42 flops */
+        }
+
+        /* End of innermost loop: hand the accumulated i force to the helper together with the f and fshift slots for this entry */
+
+        gmx_fjsp_update_iforce_1atom_swizzle_v2r8(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations (counts every j entry, including an unpaired last one) */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 7 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops, using the per-iteration counts quoted in the loop comments (7 outer, 42 inner) */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_F,outeriter*7 + inneriter*42);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEwSh_VdwNone_GeomW3P1_sparc64_hpc_ace_double.c b/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEwSh_VdwNone_GeomW3P1_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..2e68463
--- /dev/null
@@ -0,0 +1,1095 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwNone_GeomW3P1_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            None
+ * Geometry:                   Water3-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecEwSh_VdwNone_GeomW3P1_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+
+    sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+    ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+    rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv00,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+            {
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv10,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+            {
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv20,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+
+            gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 150 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv00,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+            {
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv10,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+            {
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv20,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+
+            gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 150 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 19 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3_VF,outeriter*19 + inneriter*150);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwNone_GeomW3P1_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            None
+ * Geometry:                   Water3-Particle
+ * Calculate force/pot:        Force
+ *
+ * Force-only kernel: for every i entry in the neighborlist (three sites per i entry)
+ * it loops over the j particles two at a time, evaluates tabulated Ewald
+ * electrostatics inside the cutoff rcutoff2 = fr->rcoulomb^2, and accumulates
+ * forces. No potential energy is accumulated in this variant.
+ *
+ * Arguments, as used by the code below:
+ *   nlist       - neighborlist; nri, iinr, jindex, jjnr, shift and gid are read.
+ *   xx          - coordinates, addressed as a flat real array through xx[0].
+ *   ff          - forces, addressed as a flat real array through ff[0]; updated
+ *                 through the gmx_fjsp_decrement_* and gmx_fjsp_update_iforce_* helpers.
+ *   fr          - force record; shift_vec, fshift, epsfac, rcoulomb and the
+ *                 fr->ic fields sh_ewald, tabq_coul_F and tabq_scale are read.
+ *   mdatoms     - only chargeA is read.
+ *   kernel_data - not referenced by this force-only kernel.
+ *   nrnb        - flop accounting, updated once at the end through inc_nrnb().
+ *
+ * NOTE(review): several declared locals (e.g. tx,ty,tz, vfconv, gbconv, one, two,
+ * ggid) are never used in this function; presumably boilerplate shared with the
+ * other kernels of this family - confirm before removing anything.
+ */
+void
+nb_kernel_ElecEwSh_VdwNone_GeomW3P1_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+
+    sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_F;
+    ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+    ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters.
+     * The three i-site charges are read once, from the first i entry (iinr[0]), scaled
+     * by facel, and reused for every outer iteration - presumably all i waters in
+     * this list carry identical charges (assumption; confirm against the list setup).
+     * NOTE(review): iinr[0] is read even when nri is 0 - confirm the list always
+     * has storage for at least one entry.
+     */
+    inr              = nlist->iinr[0];
+    iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+    rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop.
+         * Two j entries (A and B) are handled per iteration; if the number of j entries
+         * is odd, the last one is handled by the separate block after this loop.
+         */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+            {
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+            {
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+
+            gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 129 flops */
+        }
+
+        if(jidx<j_index_end) /* One unpaired j entry is left over from the two-at-a-time loop */
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8()); /* presumably zeroes the unused second lane - confirm */
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+            {
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+            {
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+
+            gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 129 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 18 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3_F,outeriter*18 + inneriter*129);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEwSh_VdwNone_GeomW3W3_sparc64_hpc_ace_double.c b/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEwSh_VdwNone_GeomW3W3_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..9f2d5ae
--- /dev/null
@@ -0,0 +1,2341 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwNone_GeomW3W3_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            None
+ * Geometry:                   Water3-Water3
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecEwSh_VdwNone_GeomW3W3_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B;
+    _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B;
+    _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+    _fjsp_v2r8       dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+
+    sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+    ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+
+    jq0              = gmx_fjsp_set1_v2r8(charge[inr+0]);
+    jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+    jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+    qq00             = _fjsp_mul_v2r8(iq0,jq0);
+    qq01             = _fjsp_mul_v2r8(iq0,jq1);
+    qq02             = _fjsp_mul_v2r8(iq0,jq2);
+    qq10             = _fjsp_mul_v2r8(iq1,jq0);
+    qq11             = _fjsp_mul_v2r8(iq1,jq1);
+    qq12             = _fjsp_mul_v2r8(iq1,jq2);
+    qq20             = _fjsp_mul_v2r8(iq2,jq0);
+    qq21             = _fjsp_mul_v2r8(iq2,jq1);
+    qq22             = _fjsp_mul_v2r8(iq2,jq2);
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+    rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx01             = _fjsp_sub_v2r8(ix0,jx1);
+            dy01             = _fjsp_sub_v2r8(iy0,jy1);
+            dz01             = _fjsp_sub_v2r8(iz0,jz1);
+            dx02             = _fjsp_sub_v2r8(ix0,jx2);
+            dy02             = _fjsp_sub_v2r8(iy0,jy2);
+            dz02             = _fjsp_sub_v2r8(iz0,jz2);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+            rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+            rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+            rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv00,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq01,rcutoff2))
+            {
+
+            r01              = _fjsp_mul_v2r8(rsq01,rinv01);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r01,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq01,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv01,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq01,rinv01),_fjsp_sub_v2r8(rinvsq01,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq01,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+            
+            fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq02,rcutoff2))
+            {
+
+            r02              = _fjsp_mul_v2r8(rsq02,rinv02);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r02,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq02,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv02,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq02,rinv02),_fjsp_sub_v2r8(rinvsq02,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq02,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+            
+            fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+            {
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv10,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
+            {
+
+            r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv11,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
+            {
+
+            r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv12,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+            {
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv20,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
+            {
+
+            r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv21,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
+            {
+
+            r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv22,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            }
+
+            gmx_fjsp_decrement_3rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 441 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx01             = _fjsp_sub_v2r8(ix0,jx1);
+            dy01             = _fjsp_sub_v2r8(iy0,jy1);
+            dz01             = _fjsp_sub_v2r8(iz0,jz1);
+            dx02             = _fjsp_sub_v2r8(ix0,jx2);
+            dy02             = _fjsp_sub_v2r8(iy0,jy2);
+            dz02             = _fjsp_sub_v2r8(iz0,jz2);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+            rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+            rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+            rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv00,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq01,rcutoff2))
+            {
+
+            r01              = _fjsp_mul_v2r8(rsq01,rinv01);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r01,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq01,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv01,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq01,rinv01),_fjsp_sub_v2r8(rinvsq01,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq01,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+            
+            fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq02,rcutoff2))
+            {
+
+            r02              = _fjsp_mul_v2r8(rsq02,rinv02);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r02,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq02,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv02,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq02,rinv02),_fjsp_sub_v2r8(rinvsq02,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq02,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+            
+            fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+            {
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv10,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
+            {
+
+            r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv11,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
+            {
+
+            r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv12,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+            {
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv20,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
+            {
+
+            r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv21,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
+            {
+
+            r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv22,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            }
+
+            gmx_fjsp_decrement_3rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 441 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 19 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_VF,outeriter*19 + inneriter*441);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwNone_GeomW3W3_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            None
+ * Geometry:                   Water3-Water3
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecEwSh_VdwNone_GeomW3W3_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B;
+    _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B;
+    _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+    _fjsp_v2r8       dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+
+    sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_F;
+    ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+    ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+
+    jq0              = gmx_fjsp_set1_v2r8(charge[inr+0]);
+    jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+    jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+    qq00             = _fjsp_mul_v2r8(iq0,jq0);
+    qq01             = _fjsp_mul_v2r8(iq0,jq1);
+    qq02             = _fjsp_mul_v2r8(iq0,jq2);
+    qq10             = _fjsp_mul_v2r8(iq1,jq0);
+    qq11             = _fjsp_mul_v2r8(iq1,jq1);
+    qq12             = _fjsp_mul_v2r8(iq1,jq2);
+    qq20             = _fjsp_mul_v2r8(iq2,jq0);
+    qq21             = _fjsp_mul_v2r8(iq2,jq1);
+    qq22             = _fjsp_mul_v2r8(iq2,jq2);
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+    rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx01             = _fjsp_sub_v2r8(ix0,jx1);
+            dy01             = _fjsp_sub_v2r8(iy0,jy1);
+            dz01             = _fjsp_sub_v2r8(iz0,jz1);
+            dx02             = _fjsp_sub_v2r8(ix0,jx2);
+            dy02             = _fjsp_sub_v2r8(iy0,jy2);
+            dz02             = _fjsp_sub_v2r8(iz0,jz2);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+            rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+            rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+            rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq01,rcutoff2))
+            {
+
+            r01              = _fjsp_mul_v2r8(rsq01,rinv01);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r01,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq01,rinv01),_fjsp_sub_v2r8(rinvsq01,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq01,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+            
+            fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq02,rcutoff2))
+            {
+
+            r02              = _fjsp_mul_v2r8(rsq02,rinv02);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r02,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq02,rinv02),_fjsp_sub_v2r8(rinvsq02,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq02,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+            
+            fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+            {
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
+            {
+
+            r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
+            {
+
+            r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+            {
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
+            {
+
+            r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
+            {
+
+            r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            }
+
+            gmx_fjsp_decrement_3rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 378 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx01             = _fjsp_sub_v2r8(ix0,jx1);
+            dy01             = _fjsp_sub_v2r8(iy0,jy1);
+            dz01             = _fjsp_sub_v2r8(iz0,jz1);
+            dx02             = _fjsp_sub_v2r8(ix0,jx2);
+            dy02             = _fjsp_sub_v2r8(iy0,jy2);
+            dz02             = _fjsp_sub_v2r8(iz0,jz2);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+            rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+            rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+            rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq01,rcutoff2))
+            {
+
+            r01              = _fjsp_mul_v2r8(rsq01,rinv01);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r01,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq01,rinv01),_fjsp_sub_v2r8(rinvsq01,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq01,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+            
+            fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq02,rcutoff2))
+            {
+
+            r02              = _fjsp_mul_v2r8(rsq02,rinv02);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r02,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq02,rinv02),_fjsp_sub_v2r8(rinvsq02,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq02,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+            
+            fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+            {
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
+            {
+
+            r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
+            {
+
+            r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+            {
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
+            {
+
+            r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
+            {
+
+            r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            }
+
+            gmx_fjsp_decrement_3rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 378 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 18 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_F,outeriter*18 + inneriter*378);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEwSh_VdwNone_GeomW4P1_sparc64_hpc_ace_double.c b/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEwSh_VdwNone_GeomW4P1_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..8c18557
--- /dev/null
@@ -0,0 +1,1095 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwNone_GeomW4P1_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            None
+ * Geometry:                   Water4-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecEwSh_VdwNone_GeomW4P1_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+
+    sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+    ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+    rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
+                                                 &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+        fix3             = _fjsp_setzero_v2r8();
+        fiy3             = _fjsp_setzero_v2r8();
+        fiz3             = _fjsp_setzero_v2r8();
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx30             = _fjsp_sub_v2r8(ix3,jx0);
+            dy30             = _fjsp_sub_v2r8(iy3,jy0);
+            dz30             = _fjsp_sub_v2r8(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+            {
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv10,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+            {
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv20,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq30,rcutoff2))
+            {
+
+            r30              = _fjsp_mul_v2r8(rsq30,rinv30);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _fjsp_mul_v2r8(iq3,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r30,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq30,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv30,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq30,rinv30),_fjsp_sub_v2r8(rinvsq30,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq30,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+            
+            fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+
+            }
+
+            gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 150 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx30             = _fjsp_sub_v2r8(ix3,jx0);
+            dy30             = _fjsp_sub_v2r8(iy3,jy0);
+            dz30             = _fjsp_sub_v2r8(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+            {
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv10,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+            {
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv20,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq30,rcutoff2))
+            {
+
+            r30              = _fjsp_mul_v2r8(rsq30,rinv30);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _fjsp_mul_v2r8(iq3,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r30,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq30,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv30,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq30,rinv30),_fjsp_sub_v2r8(rinvsq30,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq30,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+            
+            fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+
+            }
+
+            gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 150 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset+DIM,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 19 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4_VF,outeriter*19 + inneriter*150);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwNone_GeomW4P1_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            None
+ * Geometry:                   Water4-Particle
+ * Calculate force/pot:        Force
+ *
+ * Force-only kernel. For every i entry of nlist it accumulates table-based Ewald
+ * electrostatic forces between i sites 1..3 (coordinates and charges at offsets
+ * +1..+3 from the iinr[] index; site 0 is never touched here) and each j atom of
+ * the entry, with an explicit cutoff taken from fr->rcoulomb.
+ *
+ * Read:    nlist (nri, iinr, jindex, jjnr, shift, gid), xx, mdatoms->chargeA,
+ *          fr (shift_vec, epsfac, rcoulomb, ic->sh_ewald, ic->tabq_coul_F, ic->tabq_scale).
+ * Updated: ff through the j-force decrement and i-force update helpers, and nrnb
+ *          through inc_nrnb(). fr->fshift is handed to the i-force update helper,
+ *          which presumably accumulates the shift force there - confirm in the helper.
+ * kernel_data is not referenced in this force-only variant.
+ */
+void
+nb_kernel_ElecEwSh_VdwNone_GeomW4P1_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     *
+     * NOTE(review): many locals declared below are never referenced in this force-only
+     * kernel (tx,ty,tz, jidxall, ggid, vdwioffset1-3, isai1-3, vdwjidx0A/B, isaj0,
+     * c6_NN, c12_NN, velec, velecsum, crf, krf, krf2, ewtabD, ewtabV, dummy_mask,
+     * vfconv, gbconv), while one, two, gid, sh_ewald and ewtabhalfspace are assigned
+     * but never read. They presumably come from the shared generator template -
+     * confirm before removing anything by hand.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+    /* ewconv overlays a SIMD register with two long long integers: the table index
+     * vector is stored through .simd and read back as scalars through .i[0]/.i[1].
+     */
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+
+    sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_F;
+    ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+    ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters.
+     * The charges of i sites 1..3 are read once, from the first i entry of the list,
+     * multiplied by facel (fr->epsfac) and then reused for every i entry in the loop below.
+     * NOTE(review): this assumes all i waters in the list carry identical charges, and
+     * iinr[0] is read before nri is looked at - confirm callers never pass nri==0.
+     */
+    inr              = nlist->iinr[0];
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice.
+     * rcutoff2 is the squared cutoff; it is what the squared distances are compared against below.
+     */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+    rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector.
+         * The +DIM offset skips the first site of the i water, so ix1..iz3 hold sites 1..3.
+         */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
+                                                 &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+        fix3             = _fjsp_setzero_v2r8();
+        fiy3             = _fjsp_setzero_v2r8();
+        fiz3             = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop.
+         * Two j atoms (A and B) are processed per iteration; when the number of j
+         * entries is odd the last one is handled by the single-j block after the loop.
+         */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx30             = _fjsp_sub_v2r8(ix3,jx0);
+            dy30             = _fjsp_sub_v2r8(iy3,jy0);
+            dz30             = _fjsp_sub_v2r8(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+            {
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS
+             * ewrt is r scaled to table units, itab_tmp its integer part and eweps the
+             * remainder. The two table indices are read back as scalars through ewconv and
+             * used to fetch ewtabF and ewtabFn for both lanes.
+             * NOTE(review): assuming madd(a,b,c)=a*b+c and nmsub(a,b,c)=c-a*b, felec below is
+             * the linear interpolation (1-eweps)*ewtabF + eweps*ewtabFn - confirm against the
+             * HPC-ACE intrinsic documentation. The same sequence is repeated for sites 2 and 3.
+             */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force.
+             * fscal was ANDed with cutoff_mask above, presumably so that a lane beyond the
+             * cutoff contributes zero. The same dx*fscal term is added both to the i-site
+             * accumulator and to the j accumulator (the latter is applied to f after the
+             * three interactions).
+             */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+            {
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq30,rcutoff2))
+            {
+
+            r30              = _fjsp_mul_v2r8(rsq30,rinv30);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _fjsp_mul_v2r8(iq3,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r30,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq30,rinv30),_fjsp_sub_v2r8(rinvsq30,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq30,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+            
+            fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+
+            }
+            /* Apply the accumulated j forces to both j atoms (the helper name says "decrement"). */
+            gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 129 flops */
+        }
+        /* Single-j epilogue: taken when one j entry is left over after the pairwise loop.
+         * Only jnrA and table index ewconv.i[0] are used; the B lane carries no j atom.
+         */
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx30             = _fjsp_sub_v2r8(ix3,jx0);
+            dy30             = _fjsp_sub_v2r8(iy3,jy0);
+            dz30             = _fjsp_sub_v2r8(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+
+            /* Load parameters for j particles.
+             * Only one charge is loaded, combined with a zeroed register; presumably it
+             * lands in the low lane and the other lane stays zero - confirm for _fjsp_loadl_v2r8.
+             */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+            {
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+            /* NOTE(review): pairing fscal with a zero register presumably clears the unused
+             * B lane so it adds no force - confirm against the _fjsp_unpacklo_v2r8 documentation.
+             */
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+            {
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq30,rcutoff2))
+            {
+
+            r30              = _fjsp_mul_v2r8(rsq30,rinv30);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _fjsp_mul_v2r8(iq3,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r30,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq30,rinv30),_fjsp_sub_v2r8(rinvsq30,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq30,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+            
+            fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+
+            }
+
+            gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 129 flops */
+        }
+
+        /* End of innermost loop.
+         * The accumulators of i sites 1..3 are handed to the i-force update helper together
+         * with f at the second site of the i water (+DIM) and the shift-force slot of this
+         * list entry; what the helper does with them is not visible in this file.
+         */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset+DIM,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 18 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops.
+     * outeriter equals nri and inneriter is the total number of j entries visited, so the
+     * count below charges 18 flops per i entry and 129 flops per j entry.
+     */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4_F,outeriter*18 + inneriter*129);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEwSh_VdwNone_GeomW4W4_sparc64_hpc_ace_double.c b/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEwSh_VdwNone_GeomW4W4_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..b89e62b
--- /dev/null
@@ -0,0 +1,2341 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwNone_GeomW4W4_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            None
+ * Geometry:                   Water4-Water4
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecEwSh_VdwNone_GeomW4W4_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx1A,vdwjidx1B;
+    _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B;
+    _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    int              vdwjidx3A,vdwjidx3B;
+    _fjsp_v2r8       jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+    _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    _fjsp_v2r8       dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+    _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    _fjsp_v2r8       dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+    _fjsp_v2r8       dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+    _fjsp_v2r8       dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+    _fjsp_v2r8       dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+
+    sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+    ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+
+    jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+    jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+    jq3              = gmx_fjsp_set1_v2r8(charge[inr+3]);
+    qq11             = _fjsp_mul_v2r8(iq1,jq1);
+    qq12             = _fjsp_mul_v2r8(iq1,jq2);
+    qq13             = _fjsp_mul_v2r8(iq1,jq3);
+    qq21             = _fjsp_mul_v2r8(iq2,jq1);
+    qq22             = _fjsp_mul_v2r8(iq2,jq2);
+    qq23             = _fjsp_mul_v2r8(iq2,jq3);
+    qq31             = _fjsp_mul_v2r8(iq3,jq1);
+    qq32             = _fjsp_mul_v2r8(iq3,jq2);
+    qq33             = _fjsp_mul_v2r8(iq3,jq3);
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+    rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
+                                                 &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+        fix3             = _fjsp_setzero_v2r8();
+        fiy3             = _fjsp_setzero_v2r8();
+        fiz3             = _fjsp_setzero_v2r8();
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
+                                              &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx13             = _fjsp_sub_v2r8(ix1,jx3);
+            dy13             = _fjsp_sub_v2r8(iy1,jy3);
+            dz13             = _fjsp_sub_v2r8(iz1,jz3);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+            dx23             = _fjsp_sub_v2r8(ix2,jx3);
+            dy23             = _fjsp_sub_v2r8(iy2,jy3);
+            dz23             = _fjsp_sub_v2r8(iz2,jz3);
+            dx31             = _fjsp_sub_v2r8(ix3,jx1);
+            dy31             = _fjsp_sub_v2r8(iy3,jy1);
+            dz31             = _fjsp_sub_v2r8(iz3,jz1);
+            dx32             = _fjsp_sub_v2r8(ix3,jx2);
+            dy32             = _fjsp_sub_v2r8(iy3,jy2);
+            dz32             = _fjsp_sub_v2r8(iz3,jz2);
+            dx33             = _fjsp_sub_v2r8(ix3,jx3);
+            dy33             = _fjsp_sub_v2r8(iy3,jy3);
+            dz33             = _fjsp_sub_v2r8(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+            rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+            rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+            rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+            rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+            rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+            rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+            rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+            rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+            rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+            rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+            rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+            rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+            fjx3             = _fjsp_setzero_v2r8();
+            fjy3             = _fjsp_setzero_v2r8();
+            fjz3             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
+            {
+
+            r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv11,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
+            {
+
+            r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv12,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq13,rcutoff2))
+            {
+
+            r13              = _fjsp_mul_v2r8(rsq13,rinv13);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r13,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq13,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv13,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq13,rinv13),_fjsp_sub_v2r8(rinvsq13,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq13,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+            
+            fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
+            {
+
+            r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv21,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
+            {
+
+            r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv22,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq23,rcutoff2))
+            {
+
+            r23              = _fjsp_mul_v2r8(rsq23,rinv23);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r23,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq23,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv23,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq23,rinv23),_fjsp_sub_v2r8(rinvsq23,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq23,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+            
+            fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq31,rcutoff2))
+            {
+
+            r31              = _fjsp_mul_v2r8(rsq31,rinv31);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r31,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq31,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv31,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq31,rinv31),_fjsp_sub_v2r8(rinvsq31,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq31,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+            
+            fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq32,rcutoff2))
+            {
+
+            r32              = _fjsp_mul_v2r8(rsq32,rinv32);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r32,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq32,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv32,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq32,rinv32),_fjsp_sub_v2r8(rinvsq32,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq32,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+            
+            fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq33,rcutoff2))
+            {
+
+            r33              = _fjsp_mul_v2r8(rsq33,rinv33);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r33,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq33,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv33,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq33,rinv33),_fjsp_sub_v2r8(rinvsq33,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq33,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+            
+            fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+
+            }
+
+            gmx_fjsp_decrement_3rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA+DIM,f+j_coord_offsetB+DIM,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 441 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA+DIM,
+                                              &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx13             = _fjsp_sub_v2r8(ix1,jx3);
+            dy13             = _fjsp_sub_v2r8(iy1,jy3);
+            dz13             = _fjsp_sub_v2r8(iz1,jz3);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+            dx23             = _fjsp_sub_v2r8(ix2,jx3);
+            dy23             = _fjsp_sub_v2r8(iy2,jy3);
+            dz23             = _fjsp_sub_v2r8(iz2,jz3);
+            dx31             = _fjsp_sub_v2r8(ix3,jx1);
+            dy31             = _fjsp_sub_v2r8(iy3,jy1);
+            dz31             = _fjsp_sub_v2r8(iz3,jz1);
+            dx32             = _fjsp_sub_v2r8(ix3,jx2);
+            dy32             = _fjsp_sub_v2r8(iy3,jy2);
+            dz32             = _fjsp_sub_v2r8(iz3,jz2);
+            dx33             = _fjsp_sub_v2r8(ix3,jx3);
+            dy33             = _fjsp_sub_v2r8(iy3,jy3);
+            dz33             = _fjsp_sub_v2r8(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+            rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+            rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+            rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+            rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+            rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+            rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+            rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+            rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+            rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+            rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+            rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+            rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+            fjx3             = _fjsp_setzero_v2r8();
+            fjy3             = _fjsp_setzero_v2r8();
+            fjz3             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
+            {
+
+            r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv11,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
+            {
+
+            r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv12,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq13,rcutoff2))
+            {
+
+            r13              = _fjsp_mul_v2r8(rsq13,rinv13);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r13,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq13,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv13,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq13,rinv13),_fjsp_sub_v2r8(rinvsq13,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq13,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+            
+            fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
+            {
+
+            r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv21,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
+            {
+
+            r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv22,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq23,rcutoff2))
+            {
+
+            r23              = _fjsp_mul_v2r8(rsq23,rinv23);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r23,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq23,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv23,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq23,rinv23),_fjsp_sub_v2r8(rinvsq23,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq23,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+            
+            fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq31,rcutoff2))
+            {
+
+            r31              = _fjsp_mul_v2r8(rsq31,rinv31);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r31,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq31,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv31,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq31,rinv31),_fjsp_sub_v2r8(rinvsq31,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq31,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+            
+            fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq32,rcutoff2))
+            {
+
+            r32              = _fjsp_mul_v2r8(rsq32,rinv32);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r32,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq32,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv32,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq32,rinv32),_fjsp_sub_v2r8(rinvsq32,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq32,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+            
+            fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq33,rcutoff2))
+            {
+
+            r33              = _fjsp_mul_v2r8(rsq33,rinv33);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r33,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq33,_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv33,sh_ewald),velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq33,rinv33),_fjsp_sub_v2r8(rinvsq33,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq33,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+            
+            fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+
+            }
+
+            gmx_fjsp_decrement_3rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA+DIM,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 441 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset+DIM,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 19 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_VF,outeriter*19 + inneriter*441);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwNone_GeomW4W4_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            None
+ * Geometry:                   Water4-Water4
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecEwSh_VdwNone_GeomW4W4_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. the two different
+     * jnr indices whose data are put in the two positions of the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx1A,vdwjidx1B;
+    _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B;
+    _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    int              vdwjidx3A,vdwjidx3B;
+    _fjsp_v2r8       jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+    _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    _fjsp_v2r8       dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+    _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    _fjsp_v2r8       dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+    _fjsp_v2r8       dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+    _fjsp_v2r8       dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+    _fjsp_v2r8       dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+
+    sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_F;
+    ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+    ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+
+    jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+    jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+    jq3              = gmx_fjsp_set1_v2r8(charge[inr+3]);
+    qq11             = _fjsp_mul_v2r8(iq1,jq1);
+    qq12             = _fjsp_mul_v2r8(iq1,jq2);
+    qq13             = _fjsp_mul_v2r8(iq1,jq3);
+    qq21             = _fjsp_mul_v2r8(iq2,jq1);
+    qq22             = _fjsp_mul_v2r8(iq2,jq2);
+    qq23             = _fjsp_mul_v2r8(iq2,jq3);
+    qq31             = _fjsp_mul_v2r8(iq3,jq1);
+    qq32             = _fjsp_mul_v2r8(iq3,jq2);
+    qq33             = _fjsp_mul_v2r8(iq3,jq3);
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+    rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
+                                                 &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+        fix3             = _fjsp_setzero_v2r8();
+        fiy3             = _fjsp_setzero_v2r8();
+        fiz3             = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
+                                              &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx13             = _fjsp_sub_v2r8(ix1,jx3);
+            dy13             = _fjsp_sub_v2r8(iy1,jy3);
+            dz13             = _fjsp_sub_v2r8(iz1,jz3);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+            dx23             = _fjsp_sub_v2r8(ix2,jx3);
+            dy23             = _fjsp_sub_v2r8(iy2,jy3);
+            dz23             = _fjsp_sub_v2r8(iz2,jz3);
+            dx31             = _fjsp_sub_v2r8(ix3,jx1);
+            dy31             = _fjsp_sub_v2r8(iy3,jy1);
+            dz31             = _fjsp_sub_v2r8(iz3,jz1);
+            dx32             = _fjsp_sub_v2r8(ix3,jx2);
+            dy32             = _fjsp_sub_v2r8(iy3,jy2);
+            dz32             = _fjsp_sub_v2r8(iz3,jz2);
+            dx33             = _fjsp_sub_v2r8(ix3,jx3);
+            dy33             = _fjsp_sub_v2r8(iy3,jy3);
+            dz33             = _fjsp_sub_v2r8(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+            rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+            rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+            rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+            rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+            rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+            rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+            rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+            rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+            rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+            rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+            rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+            rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+            fjx3             = _fjsp_setzero_v2r8();
+            fjy3             = _fjsp_setzero_v2r8();
+            fjz3             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
+            {
+
+            r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
+            {
+
+            r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq13,rcutoff2))
+            {
+
+            r13              = _fjsp_mul_v2r8(rsq13,rinv13);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r13,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq13,rinv13),_fjsp_sub_v2r8(rinvsq13,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq13,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+            
+            fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
+            {
+
+            r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
+            {
+
+            r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq23,rcutoff2))
+            {
+
+            r23              = _fjsp_mul_v2r8(rsq23,rinv23);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r23,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq23,rinv23),_fjsp_sub_v2r8(rinvsq23,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq23,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+            
+            fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq31,rcutoff2))
+            {
+
+            r31              = _fjsp_mul_v2r8(rsq31,rinv31);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r31,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq31,rinv31),_fjsp_sub_v2r8(rinvsq31,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq31,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+            
+            fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq32,rcutoff2))
+            {
+
+            r32              = _fjsp_mul_v2r8(rsq32,rinv32);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r32,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq32,rinv32),_fjsp_sub_v2r8(rinvsq32,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq32,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+            
+            fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq33,rcutoff2))
+            {
+
+            r33              = _fjsp_mul_v2r8(rsq33,rinv33);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r33,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq33,rinv33),_fjsp_sub_v2r8(rinvsq33,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq33,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+            
+            fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+
+            }
+
+            gmx_fjsp_decrement_3rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA+DIM,f+j_coord_offsetB+DIM,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 378 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA+DIM,
+                                              &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx13             = _fjsp_sub_v2r8(ix1,jx3);
+            dy13             = _fjsp_sub_v2r8(iy1,jy3);
+            dz13             = _fjsp_sub_v2r8(iz1,jz3);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+            dx23             = _fjsp_sub_v2r8(ix2,jx3);
+            dy23             = _fjsp_sub_v2r8(iy2,jy3);
+            dz23             = _fjsp_sub_v2r8(iz2,jz3);
+            dx31             = _fjsp_sub_v2r8(ix3,jx1);
+            dy31             = _fjsp_sub_v2r8(iy3,jy1);
+            dz31             = _fjsp_sub_v2r8(iz3,jz1);
+            dx32             = _fjsp_sub_v2r8(ix3,jx2);
+            dy32             = _fjsp_sub_v2r8(iy3,jy2);
+            dz32             = _fjsp_sub_v2r8(iz3,jz2);
+            dx33             = _fjsp_sub_v2r8(ix3,jx3);
+            dy33             = _fjsp_sub_v2r8(iy3,jy3);
+            dz33             = _fjsp_sub_v2r8(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+            rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+            rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+            rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+            rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+            rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+            rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+            rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+            rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+            rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+            rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+            rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+            rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+            fjx3             = _fjsp_setzero_v2r8();
+            fjy3             = _fjsp_setzero_v2r8();
+            fjz3             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
+            {
+
+            r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
+            {
+
+            r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq13,rcutoff2))
+            {
+
+            r13              = _fjsp_mul_v2r8(rsq13,rinv13);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r13,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq13,rinv13),_fjsp_sub_v2r8(rinvsq13,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq13,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+            
+            fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
+            {
+
+            r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
+            {
+
+            r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq23,rcutoff2))
+            {
+
+            r23              = _fjsp_mul_v2r8(rsq23,rinv23);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r23,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq23,rinv23),_fjsp_sub_v2r8(rinvsq23,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq23,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+            
+            fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq31,rcutoff2))
+            {
+
+            r31              = _fjsp_mul_v2r8(rsq31,rinv31);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r31,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq31,rinv31),_fjsp_sub_v2r8(rinvsq31,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq31,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+            
+            fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq32,rcutoff2))
+            {
+
+            r32              = _fjsp_mul_v2r8(rsq32,rinv32);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r32,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq32,rinv32),_fjsp_sub_v2r8(rinvsq32,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq32,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+            
+            fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq33,rcutoff2))
+            {
+
+            r33              = _fjsp_mul_v2r8(rsq33,rinv33);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r33,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq33,rinv33),_fjsp_sub_v2r8(rinvsq33,felec));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq33,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+            
+            fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+
+            }
+
+            gmx_fjsp_decrement_3rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA+DIM,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 378 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset+DIM,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 18 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_F,outeriter*18 + inneriter*378);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEwSw_VdwLJSw_GeomP1P1_sparc64_hpc_ace_double.c b/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEwSw_VdwLJSw_GeomP1P1_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..0fb4e05
--- /dev/null
@@ -0,0 +1,759 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSw_VdwLJSw_GeomP1P1_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: Ewald (correction read from the ewtab lookup table, switch function applied)
+ * VdW interaction:            LennardJones (c6/c12 read from vdwparam, same switch function applied)
+ * Geometry:                   Particle-Particle (one i atom per neighborlist entry, two j atoms per SIMD pass)
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecEwSw_VdwLJSw_GeomP1P1_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist, /* neighbor list; nri, iinr, jindex, jjnr, shift and gid are read */
+                     rvec * gmx_restrict                    xx, /* coordinates; read through x=xx[0] at offset DIM*index */
+                     rvec * gmx_restrict                    ff, /* forces; updated in place through f=ff[0] */
+                     t_forcerec * gmx_restrict              fr, /* supplies shift_vec, fshift, epsfac, ntype, nbfp, rcoulomb, rcoulomb_switch and ic->tabq_* */
+                     t_mdatoms * gmx_restrict               mdatoms, /* chargeA and typeA are read */
+                     nb_kernel_data_t * gmx_restrict        kernel_data, /* energygrp_elec and energygrp_vdw are updated per energy group id */
+                     t_nrnb * gmx_restrict                  nrnb) /* flop accounting, updated once at the end via inc_nrnb */
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters (this particle-particle kernel only uses suffix 0).
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    _fjsp_v2r8       rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
+    real             rswitch_scalar,d_scalar;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv; /* view a SIMD register as two 64-bit integers; only ewconv is used below */
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+    ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+    rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+
+    rswitch_scalar   = fr->rcoulomb_switch;
+    rswitch          = gmx_fjsp_set1_v2r8(rswitch_scalar);
+    /* Setup switch parameters: polynomial coefficients divided by powers (3..5) of the switching width d_scalar=rcutoff-rswitch */
+    d_scalar         = rcutoff_scalar-rswitch_scalar;
+    d                = gmx_fjsp_set1_v2r8(d_scalar);
+    swV3             = gmx_fjsp_set1_v2r8(-10.0/(d_scalar*d_scalar*d_scalar));
+    swV4             = gmx_fjsp_set1_v2r8( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+    swV5             = gmx_fjsp_set1_v2r8( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+    swF2             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar));
+    swF3             = gmx_fjsp_set1_v2r8( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+    swF4             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+
+    /* Pre-initialize the j indices and offsets; this only serves to avoid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Offset of the shift vector (and shift force) for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_1rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+
+        /* Load parameters for i particles: charge pre-multiplied by facel, and the i-type row offset into vdwparam */
+        iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_load1_v2r8(charge+inr+0));
+        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+        vvdwsum          = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop: two j atoms (A and B) per iteration; an odd leftover j atom is handled after the loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer; eweps is the remaining fraction */
+            ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(rinv00,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            d                = _fjsp_sub_v2r8(r00,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(velec,dsw)) );
+            fvdw             = _fjsp_msub_v2r8( fvdw,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(vvdw,dsw)) );
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            vvdw             = _fjsp_mul_v2r8(vvdw,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom; cutoff_mask (rsq00<rcutoff2) is ANDed in first */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+            vvdw             = _fjsp_and_v2r8(vvdw,cutoff_mask);
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fscal,dx00,dy00,dz00);
+
+            }
+
+            /* Inner loop uses 86 flops */
+        }
+
+        if(jidx<j_index_end) /* one j atom is left over when the list length is odd; only the A slot is loaded */
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer; only index i[0] is used for the lookup here */
+            ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(rinv00,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            d                = _fjsp_sub_v2r8(r00,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(velec,dsw)) );
+            fvdw             = _fjsp_msub_v2r8( fvdw,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(vvdw,dsw)) );
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            vvdw             = _fjsp_mul_v2r8(vvdw,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom; cutoff_mask (rsq00<rcutoff2) is ANDed in first */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8()); /* NOTE(review): presumably keeps the A lane and zeroes the unused second lane - confirm intrinsic semantics */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+            vvdw             = _fjsp_and_v2r8(vvdw,cutoff_mask);
+            vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fscal,dx00,dy00,dz00);
+
+            }
+
+            /* Inner loop uses 86 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_1atom_swizzle_v2r8(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies for the energy group of this list */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations (counted per j entry, not per SIMD pass) */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 9 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops: 9 per outer iteration plus 86 per j entry */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_VF,outeriter*9 + inneriter*86);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSw_VdwLJSw_GeomP1P1_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LennardJones
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        Force
+ *
+ * Force-only variant: no potential energies are accumulated or written.
+ * Both the tabulated Ewald term and the Lennard-Jones term are multiplied
+ * by a switch function that is evaluated between fr->rcoulomb_switch and
+ * fr->rcoulomb, and contributions are masked out where rsq >= rcoulomb^2.
+ *
+ * Arguments, as used by this kernel:
+ *   nlist        Neighbor list; nri, iinr, jindex, jjnr and shift are read.
+ *   xx           Coordinates; read through the flat pointer xx[0].
+ *   ff           Forces; updated through the flat pointer ff[0].
+ *   fr           Supplies shift_vec, fshift, epsfac, ntype, nbfp, rcoulomb,
+ *                rcoulomb_switch and the Ewald table data in fr->ic.
+ *   mdatoms      Supplies per-atom chargeA and typeA.
+ *   kernel_data  Not referenced by this force-only kernel.
+ *   nrnb         Flop accounting; incremented once at the end via inc_nrnb().
+ */
+void
+nb_kernel_ElecEwSw_VdwLJSw_GeomP1P1_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    /* NOTE(review): this is generator output; a number of the locals below
+     * (e.g. ggid, two, dummy_mask, vfconv, gbconv) are declared or assigned
+     * but never read in this force-only variant.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    _fjsp_v2r8       rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
+    real             rswitch_scalar,d_scalar;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    /* Overlay used to read the two per-lane table indices back as scalars */
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    /* Treat the rvec arrays as flat arrays of reals, indexed by DIM*atom */
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+    ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+    rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+
+    rswitch_scalar   = fr->rcoulomb_switch;
+    rswitch          = gmx_fjsp_set1_v2r8(rswitch_scalar);
+    /* Setup switch parameters */
+    /* With D = rcutoff-rswitch, the coefficients below give
+     *   sw(d)  = 1 - 10*(d/D)^3 + 15*(d/D)^4 - 6*(d/D)^5
+     *   dsw(d) = d(sw)/dd = -30*d^2/D^3 + 60*d^3/D^4 - 30*d^4/D^5
+     * as evaluated in the interaction blocks further down.
+     * NOTE(review): D is used as a divisor without a check here; this
+     * presumably relies on the caller guaranteeing rcoulomb_switch < rcoulomb.
+     */
+    d_scalar         = rcutoff_scalar-rswitch_scalar;
+    /* NOTE(review): this value of d is overwritten before every read below */
+    d                = gmx_fjsp_set1_v2r8(d_scalar);
+    swV3             = gmx_fjsp_set1_v2r8(-10.0/(d_scalar*d_scalar*d_scalar));
+    swV4             = gmx_fjsp_set1_v2r8( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+    swV5             = gmx_fjsp_set1_v2r8( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+    swF2             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar));
+    swF3             = gmx_fjsp_set1_v2r8( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+    swF4             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_1rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
+        /* Per-i-atom force accumulators, reduced after the j loops */
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+
+        /* Load parameters for i particles */
+        iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_load1_v2r8(charge+inr+0));
+        /* Row offset into vdwparam: two reals (c6,c12) per type pair */
+        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+        /* Start inner kernel loop */
+        /* Handles j entries two at a time (A and B); an odd leftover entry
+         * is processed by the single-entry block after this loop.
+         */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Presumably skips the whole block when neither lane is inside
+             * the cutoff; per-lane masking is applied via cutoff_mask below.
+             */
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            /* r = rsq * (1/r) */
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            /* eweps is the remainder of ewrt after removing the table index */
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            /* Table entries are 4 reals wide; offsets +0/+1 and +2 are read
+             * for each lane (presumably F, D and V, going by the
+             * tabq_coul_FDV0 name - TODO confirm layout).
+             */
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            /* velec is needed even in this force-only kernel because the
+             * switched force below contains a velec*dsw term.
+             */
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(rinv00,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            /* d = max(r-rswitch,0), so d is zero below the switch radius */
+            d                = _fjsp_sub_v2r8(r00,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            /* sw = 1 + d^3*(swV3 + d*(swV4 + d*swV5)), assuming madd(a,b,c) = a*b+c */
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            /* dsw = d^2*(swF2 + d*(swF3 + d*swF4)), the derivative of sw with respect to d */
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(velec,dsw)) );
+            fvdw             = _fjsp_msub_v2r8( fvdw,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(vvdw,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            /* Presumably clears the lanes where rsq00 >= rcutoff2 */
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fscal,dx00,dy00,dz00);
+
+            }
+
+            /* Inner loop uses 80 flops */
+        }
+
+        /* Remainder: one j entry is left when the list length is odd.
+         * Only the A slot is loaded; the second SIMD lane is not backed by
+         * a real j atom here.
+         */
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            /* Only table index i[0] is used; zeros stand in for the second j entry */
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(rinv00,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            /* Same switch evaluation as in the two-entry loop above */
+            d                = _fjsp_sub_v2r8(r00,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(velec,dsw)) );
+            fvdw             = _fjsp_msub_v2r8( fvdw,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(vvdw,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Presumably keeps the low lane of fscal and zeroes the other,
+             * so the unused lane adds nothing to the i force - TODO confirm.
+             */
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fscal,dx00,dy00,dz00);
+
+            }
+
+            /* Inner loop uses 80 flops */
+        }
+
+        /* End of innermost loop */
+
+        /* Hand the accumulated i force to the helper, together with the
+         * force and shift-force destinations for this i atom.
+         */
+        gmx_fjsp_update_iforce_1atom_swizzle_v2r8(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        /* Counts j entries, not SIMD iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 7 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_F,outeriter*7 + inneriter*80);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEwSw_VdwLJSw_GeomW3P1_sparc64_hpc_ace_double.c b/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEwSw_VdwLJSw_GeomW3P1_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..9fd2aff
--- /dev/null
@@ -0,0 +1,1365 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSw_VdwLJSw_GeomW3P1_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LennardJones
+ * Geometry:                   Water3-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecEwSw_VdwLJSw_GeomW3P1_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    _fjsp_v2r8       rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
+    real             rswitch_scalar,d_scalar;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+    ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+    rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+
+    rswitch_scalar   = fr->rcoulomb_switch;
+    rswitch          = gmx_fjsp_set1_v2r8(rswitch_scalar);
+    /* Setup switch parameters */
+    d_scalar         = rcutoff_scalar-rswitch_scalar;
+    d                = gmx_fjsp_set1_v2r8(d_scalar);
+    swV3             = gmx_fjsp_set1_v2r8(-10.0/(d_scalar*d_scalar*d_scalar));
+    swV4             = gmx_fjsp_set1_v2r8( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+    swV5             = gmx_fjsp_set1_v2r8( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+    swF2             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar));
+    swF3             = gmx_fjsp_set1_v2r8( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+    swF4             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+        vvdwsum          = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(rinv00,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            d                = _fjsp_sub_v2r8(r00,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(velec,dsw)) );
+            fvdw             = _fjsp_msub_v2r8( fvdw,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(vvdw,dsw)) );
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            vvdw             = _fjsp_mul_v2r8(vvdw,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+            vvdw             = _fjsp_and_v2r8(vvdw,cutoff_mask);
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+            {
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(rinv10,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+
+            d                = _fjsp_sub_v2r8(r10,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv10,_fjsp_mul_v2r8(velec,dsw)) );
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+            {
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(rinv20,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+
+            d                = _fjsp_sub_v2r8(r20,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv20,_fjsp_mul_v2r8(velec,dsw)) );
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+
+            gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 225 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(rinv00,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            d                = _fjsp_sub_v2r8(r00,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(velec,dsw)) );
+            fvdw             = _fjsp_msub_v2r8( fvdw,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(vvdw,dsw)) );
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            vvdw             = _fjsp_mul_v2r8(vvdw,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+            vvdw             = _fjsp_and_v2r8(vvdw,cutoff_mask);
+            vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+            {
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(rinv10,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+
+            d                = _fjsp_sub_v2r8(r10,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv10,_fjsp_mul_v2r8(velec,dsw)) );
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+            {
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(rinv20,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+
+            d                = _fjsp_sub_v2r8(r20,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv20,_fjsp_mul_v2r8(velec,dsw)) );
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+
+            gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 225 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 20 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_VF,outeriter*20 + inneriter*225);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSw_VdwLJSw_GeomW3P1_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LennardJones
+ * Geometry:                   Water3-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecEwSw_VdwLJSw_GeomW3P1_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    _fjsp_v2r8       rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
+    real             rswitch_scalar,d_scalar;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+    ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+    rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+
+    rswitch_scalar   = fr->rcoulomb_switch;
+    rswitch          = gmx_fjsp_set1_v2r8(rswitch_scalar);
+    /* Setup switch parameters */
+    d_scalar         = rcutoff_scalar-rswitch_scalar;
+    d                = gmx_fjsp_set1_v2r8(d_scalar);
+    swV3             = gmx_fjsp_set1_v2r8(-10.0/(d_scalar*d_scalar*d_scalar));
+    swV4             = gmx_fjsp_set1_v2r8( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+    swV5             = gmx_fjsp_set1_v2r8( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+    swF2             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar));
+    swF3             = gmx_fjsp_set1_v2r8( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+    swF4             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(rinv00,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            d                = _fjsp_sub_v2r8(r00,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(velec,dsw)) );
+            fvdw             = _fjsp_msub_v2r8( fvdw,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(vvdw,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+            {
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(rinv10,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+
+            d                = _fjsp_sub_v2r8(r10,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv10,_fjsp_mul_v2r8(velec,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+            {
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(rinv20,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+
+            d                = _fjsp_sub_v2r8(r20,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv20,_fjsp_mul_v2r8(velec,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+
+            gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 213 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(rinv00,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            d                = _fjsp_sub_v2r8(r00,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(velec,dsw)) );
+            fvdw             = _fjsp_msub_v2r8( fvdw,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(vvdw,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+            {
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(rinv10,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+
+            d                = _fjsp_sub_v2r8(r10,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv10,_fjsp_mul_v2r8(velec,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+            {
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(rinv20,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+
+            d                = _fjsp_sub_v2r8(r20,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv20,_fjsp_mul_v2r8(velec,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+
+            gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 213 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 18 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_F,outeriter*18 + inneriter*213);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_sparc64_hpc_ace_double.c b/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..4f7cf4c
--- /dev/null
@@ -0,0 +1,2935 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LennardJones
+ * Geometry:                   Water3-Water3
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B;
+    _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B;
+    _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+    _fjsp_v2r8       dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    _fjsp_v2r8       rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
+    real             rswitch_scalar,d_scalar;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+    ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq0              = gmx_fjsp_set1_v2r8(charge[inr+0]);
+    jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+    jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    qq00             = _fjsp_mul_v2r8(iq0,jq0);
+    c6_00            = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    qq01             = _fjsp_mul_v2r8(iq0,jq1);
+    qq02             = _fjsp_mul_v2r8(iq0,jq2);
+    qq10             = _fjsp_mul_v2r8(iq1,jq0);
+    qq11             = _fjsp_mul_v2r8(iq1,jq1);
+    qq12             = _fjsp_mul_v2r8(iq1,jq2);
+    qq20             = _fjsp_mul_v2r8(iq2,jq0);
+    qq21             = _fjsp_mul_v2r8(iq2,jq1);
+    qq22             = _fjsp_mul_v2r8(iq2,jq2);
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+    rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+
+    rswitch_scalar   = fr->rcoulomb_switch;
+    rswitch          = gmx_fjsp_set1_v2r8(rswitch_scalar);
+    /* Setup switch parameters */
+    d_scalar         = rcutoff_scalar-rswitch_scalar;
+    d                = gmx_fjsp_set1_v2r8(d_scalar);
+    swV3             = gmx_fjsp_set1_v2r8(-10.0/(d_scalar*d_scalar*d_scalar));
+    swV4             = gmx_fjsp_set1_v2r8( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+    swV5             = gmx_fjsp_set1_v2r8( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+    swF2             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar));
+    swF3             = gmx_fjsp_set1_v2r8( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+    swF4             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+        vvdwsum          = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx01             = _fjsp_sub_v2r8(ix0,jx1);
+            dy01             = _fjsp_sub_v2r8(iy0,jy1);
+            dz01             = _fjsp_sub_v2r8(iz0,jz1);
+            dx02             = _fjsp_sub_v2r8(ix0,jx2);
+            dy02             = _fjsp_sub_v2r8(iy0,jy2);
+            dz02             = _fjsp_sub_v2r8(iz0,jz2);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+            rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+            rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+            rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(rinv00,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            d                = _fjsp_sub_v2r8(r00,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(velec,dsw)) );
+            fvdw             = _fjsp_msub_v2r8( fvdw,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(vvdw,dsw)) );
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            vvdw             = _fjsp_mul_v2r8(vvdw,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+            vvdw             = _fjsp_and_v2r8(vvdw,cutoff_mask);
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq01,rcutoff2))
+            {
+
+            r01              = _fjsp_mul_v2r8(rsq01,rinv01);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r01,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq01,_fjsp_sub_v2r8(rinv01,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq01,rinv01),_fjsp_sub_v2r8(rinvsq01,felec));
+
+            d                = _fjsp_sub_v2r8(r01,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv01,_fjsp_mul_v2r8(velec,dsw)) );
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq01,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+            
+            fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq02,rcutoff2))
+            {
+
+            r02              = _fjsp_mul_v2r8(rsq02,rinv02);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r02,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq02,_fjsp_sub_v2r8(rinv02,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq02,rinv02),_fjsp_sub_v2r8(rinvsq02,felec));
+
+            d                = _fjsp_sub_v2r8(r02,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv02,_fjsp_mul_v2r8(velec,dsw)) );
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq02,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+            
+            fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+            {
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(rinv10,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+
+            d                = _fjsp_sub_v2r8(r10,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv10,_fjsp_mul_v2r8(velec,dsw)) );
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
+            {
+
+            r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(rinv11,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
+
+            d                = _fjsp_sub_v2r8(r11,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv11,_fjsp_mul_v2r8(velec,dsw)) );
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
+            {
+
+            r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(rinv12,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
+
+            d                = _fjsp_sub_v2r8(r12,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv12,_fjsp_mul_v2r8(velec,dsw)) );
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+            {
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(rinv20,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+
+            d                = _fjsp_sub_v2r8(r20,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv20,_fjsp_mul_v2r8(velec,dsw)) );
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
+            {
+
+            r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(rinv21,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
+
+            d                = _fjsp_sub_v2r8(r21,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv21,_fjsp_mul_v2r8(velec,dsw)) );
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
+            {
+
+            r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(rinv22,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
+
+            d                = _fjsp_sub_v2r8(r22,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv22,_fjsp_mul_v2r8(velec,dsw)) );
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            }
+
+            gmx_fjsp_decrement_3rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 630 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx01             = _fjsp_sub_v2r8(ix0,jx1);
+            dy01             = _fjsp_sub_v2r8(iy0,jy1);
+            dz01             = _fjsp_sub_v2r8(iz0,jz1);
+            dx02             = _fjsp_sub_v2r8(ix0,jx2);
+            dy02             = _fjsp_sub_v2r8(iy0,jy2);
+            dz02             = _fjsp_sub_v2r8(iz0,jz2);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+            rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+            rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+            rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(rinv00,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            d                = _fjsp_sub_v2r8(r00,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(velec,dsw)) );
+            fvdw             = _fjsp_msub_v2r8( fvdw,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(vvdw,dsw)) );
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            vvdw             = _fjsp_mul_v2r8(vvdw,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+            vvdw             = _fjsp_and_v2r8(vvdw,cutoff_mask);
+            vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq01,rcutoff2))
+            {
+
+            r01              = _fjsp_mul_v2r8(rsq01,rinv01);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r01,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq01,_fjsp_sub_v2r8(rinv01,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq01,rinv01),_fjsp_sub_v2r8(rinvsq01,felec));
+
+            d                = _fjsp_sub_v2r8(r01,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv01,_fjsp_mul_v2r8(velec,dsw)) );
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq01,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+            
+            fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq02,rcutoff2))
+            {
+
+            r02              = _fjsp_mul_v2r8(rsq02,rinv02);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r02,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq02,_fjsp_sub_v2r8(rinv02,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq02,rinv02),_fjsp_sub_v2r8(rinvsq02,felec));
+
+            d                = _fjsp_sub_v2r8(r02,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv02,_fjsp_mul_v2r8(velec,dsw)) );
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq02,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+            
+            fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+            {
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(rinv10,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+
+            d                = _fjsp_sub_v2r8(r10,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv10,_fjsp_mul_v2r8(velec,dsw)) );
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
+            {
+
+            r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(rinv11,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
+
+            d                = _fjsp_sub_v2r8(r11,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv11,_fjsp_mul_v2r8(velec,dsw)) );
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
+            {
+
+            r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(rinv12,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
+
+            d                = _fjsp_sub_v2r8(r12,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv12,_fjsp_mul_v2r8(velec,dsw)) );
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+            {
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(rinv20,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+
+            d                = _fjsp_sub_v2r8(r20,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv20,_fjsp_mul_v2r8(velec,dsw)) );
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
+            {
+
+            r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(rinv21,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
+
+            d                = _fjsp_sub_v2r8(r21,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv21,_fjsp_mul_v2r8(velec,dsw)) );
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
+            {
+
+            r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(rinv22,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
+
+            d                = _fjsp_sub_v2r8(r22,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv22,_fjsp_mul_v2r8(velec,dsw)) );
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            }
+
+            gmx_fjsp_decrement_3rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 630 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 20 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*630);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LennardJones
+ * Geometry:                   Water3-Water3
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B;
+    _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B;
+    _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+    _fjsp_v2r8       dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    _fjsp_v2r8       rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
+    real             rswitch_scalar,d_scalar;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+    ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq0              = gmx_fjsp_set1_v2r8(charge[inr+0]);
+    jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+    jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    qq00             = _fjsp_mul_v2r8(iq0,jq0);
+    c6_00            = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    qq01             = _fjsp_mul_v2r8(iq0,jq1);
+    qq02             = _fjsp_mul_v2r8(iq0,jq2);
+    qq10             = _fjsp_mul_v2r8(iq1,jq0);
+    qq11             = _fjsp_mul_v2r8(iq1,jq1);
+    qq12             = _fjsp_mul_v2r8(iq1,jq2);
+    qq20             = _fjsp_mul_v2r8(iq2,jq0);
+    qq21             = _fjsp_mul_v2r8(iq2,jq1);
+    qq22             = _fjsp_mul_v2r8(iq2,jq2);
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+    rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+
+    rswitch_scalar   = fr->rcoulomb_switch;
+    rswitch          = gmx_fjsp_set1_v2r8(rswitch_scalar);
+    /* Setup switch parameters */
+    d_scalar         = rcutoff_scalar-rswitch_scalar;
+    d                = gmx_fjsp_set1_v2r8(d_scalar);
+    swV3             = gmx_fjsp_set1_v2r8(-10.0/(d_scalar*d_scalar*d_scalar));
+    swV4             = gmx_fjsp_set1_v2r8( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+    swV5             = gmx_fjsp_set1_v2r8( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+    swF2             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar));
+    swF3             = gmx_fjsp_set1_v2r8( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+    swF4             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx01             = _fjsp_sub_v2r8(ix0,jx1);
+            dy01             = _fjsp_sub_v2r8(iy0,jy1);
+            dz01             = _fjsp_sub_v2r8(iz0,jz1);
+            dx02             = _fjsp_sub_v2r8(ix0,jx2);
+            dy02             = _fjsp_sub_v2r8(iy0,jy2);
+            dz02             = _fjsp_sub_v2r8(iz0,jz2);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+            rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+            rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+            rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(rinv00,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            d                = _fjsp_sub_v2r8(r00,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(velec,dsw)) );
+            fvdw             = _fjsp_msub_v2r8( fvdw,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(vvdw,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq01,rcutoff2))
+            {
+
+            r01              = _fjsp_mul_v2r8(rsq01,rinv01);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r01,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq01,_fjsp_sub_v2r8(rinv01,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq01,rinv01),_fjsp_sub_v2r8(rinvsq01,felec));
+
+            d                = _fjsp_sub_v2r8(r01,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv01,_fjsp_mul_v2r8(velec,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq01,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+            
+            fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq02,rcutoff2))
+            {
+
+            r02              = _fjsp_mul_v2r8(rsq02,rinv02);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r02,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq02,_fjsp_sub_v2r8(rinv02,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq02,rinv02),_fjsp_sub_v2r8(rinvsq02,felec));
+
+            d                = _fjsp_sub_v2r8(r02,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv02,_fjsp_mul_v2r8(velec,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq02,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+            
+            fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+            {
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(rinv10,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+
+            d                = _fjsp_sub_v2r8(r10,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv10,_fjsp_mul_v2r8(velec,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
+            {
+
+            r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(rinv11,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
+
+            d                = _fjsp_sub_v2r8(r11,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv11,_fjsp_mul_v2r8(velec,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
+            {
+
+            r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(rinv12,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
+
+            d                = _fjsp_sub_v2r8(r12,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv12,_fjsp_mul_v2r8(velec,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+            {
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(rinv20,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+
+            d                = _fjsp_sub_v2r8(r20,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv20,_fjsp_mul_v2r8(velec,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
+            {
+
+            r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(rinv21,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
+
+            d                = _fjsp_sub_v2r8(r21,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv21,_fjsp_mul_v2r8(velec,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
+            {
+
+            r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(rinv22,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
+
+            d                = _fjsp_sub_v2r8(r22,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv22,_fjsp_mul_v2r8(velec,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            }
+
+            gmx_fjsp_decrement_3rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 600 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx01             = _fjsp_sub_v2r8(ix0,jx1);
+            dy01             = _fjsp_sub_v2r8(iy0,jy1);
+            dz01             = _fjsp_sub_v2r8(iz0,jz1);
+            dx02             = _fjsp_sub_v2r8(ix0,jx2);
+            dy02             = _fjsp_sub_v2r8(iy0,jy2);
+            dz02             = _fjsp_sub_v2r8(iz0,jz2);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+            rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+            rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+            rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(rinv00,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            d                = _fjsp_sub_v2r8(r00,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(velec,dsw)) );
+            fvdw             = _fjsp_msub_v2r8( fvdw,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(vvdw,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq01,rcutoff2))
+            {
+
+            r01              = _fjsp_mul_v2r8(rsq01,rinv01);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r01,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq01,_fjsp_sub_v2r8(rinv01,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq01,rinv01),_fjsp_sub_v2r8(rinvsq01,felec));
+
+            d                = _fjsp_sub_v2r8(r01,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv01,_fjsp_mul_v2r8(velec,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq01,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+            
+            fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq02,rcutoff2))
+            {
+
+            r02              = _fjsp_mul_v2r8(rsq02,rinv02);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r02,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq02,_fjsp_sub_v2r8(rinv02,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq02,rinv02),_fjsp_sub_v2r8(rinvsq02,felec));
+
+            d                = _fjsp_sub_v2r8(r02,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv02,_fjsp_mul_v2r8(velec,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq02,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+            
+            fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+            {
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(rinv10,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+
+            d                = _fjsp_sub_v2r8(r10,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv10,_fjsp_mul_v2r8(velec,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
+            {
+
+            r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(rinv11,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
+
+            d                = _fjsp_sub_v2r8(r11,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv11,_fjsp_mul_v2r8(velec,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
+            {
+
+            r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(rinv12,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
+
+            d                = _fjsp_sub_v2r8(r12,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv12,_fjsp_mul_v2r8(velec,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+            {
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(rinv20,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+
+            d                = _fjsp_sub_v2r8(r20,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv20,_fjsp_mul_v2r8(velec,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
+            {
+
+            r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(rinv21,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
+
+            d                = _fjsp_sub_v2r8(r21,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv21,_fjsp_mul_v2r8(velec,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
+            {
+
+            r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(rinv22,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
+
+            d                = _fjsp_sub_v2r8(r22,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv22,_fjsp_mul_v2r8(velec,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            }
+
+            gmx_fjsp_decrement_3rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 600 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 18 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*600);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_sparc64_hpc_ace_double.c b/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..1a8925d
--- /dev/null
@@ -0,0 +1,1557 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: Ewald (with potential switch)
+ * VdW interaction:            LennardJones (with potential switch)
+ * Geometry:                   Water4-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    _fjsp_v2r8       rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
+    real             rswitch_scalar,d_scalar;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+    ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+    rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+
+    rswitch_scalar   = fr->rcoulomb_switch;
+    rswitch          = gmx_fjsp_set1_v2r8(rswitch_scalar);
+    /* Setup switch parameters */
+    d_scalar         = rcutoff_scalar-rswitch_scalar;
+    d                = gmx_fjsp_set1_v2r8(d_scalar);
+    swV3             = gmx_fjsp_set1_v2r8(-10.0/(d_scalar*d_scalar*d_scalar));
+    swV4             = gmx_fjsp_set1_v2r8( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+    swV5             = gmx_fjsp_set1_v2r8( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+    swF2             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar));
+    swF3             = gmx_fjsp_set1_v2r8( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+    swF4             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_4rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+        fix3             = _fjsp_setzero_v2r8();
+        fiy3             = _fjsp_setzero_v2r8();
+        fiz3             = _fjsp_setzero_v2r8();
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+        vvdwsum          = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx30             = _fjsp_sub_v2r8(ix3,jx0);
+            dy30             = _fjsp_sub_v2r8(iy3,jy0);
+            dz30             = _fjsp_sub_v2r8(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            d                = _fjsp_sub_v2r8(r00,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            fvdw             = _fjsp_msub_v2r8( fvdw,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(vvdw,dsw)) );
+            vvdw             = _fjsp_mul_v2r8(vvdw,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdw             = _fjsp_and_v2r8(vvdw,cutoff_mask);
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+            {
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(rinv10,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+
+            d                = _fjsp_sub_v2r8(r10,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv10,_fjsp_mul_v2r8(velec,dsw)) );
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+            {
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(rinv20,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+
+            d                = _fjsp_sub_v2r8(r20,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv20,_fjsp_mul_v2r8(velec,dsw)) );
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq30,rcutoff2))
+            {
+
+            r30              = _fjsp_mul_v2r8(rsq30,rinv30);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _fjsp_mul_v2r8(iq3,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r30,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq30,_fjsp_sub_v2r8(rinv30,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq30,rinv30),_fjsp_sub_v2r8(rinvsq30,felec));
+
+            d                = _fjsp_sub_v2r8(r30,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv30,_fjsp_mul_v2r8(velec,dsw)) );
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq30,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+            
+            fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+
+            }
+
+            gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 269 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx30             = _fjsp_sub_v2r8(ix3,jx0);
+            dy30             = _fjsp_sub_v2r8(iy3,jy0);
+            dz30             = _fjsp_sub_v2r8(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            d                = _fjsp_sub_v2r8(r00,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            fvdw             = _fjsp_msub_v2r8( fvdw,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(vvdw,dsw)) );
+            vvdw             = _fjsp_mul_v2r8(vvdw,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdw             = _fjsp_and_v2r8(vvdw,cutoff_mask);
+            vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+            {
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(rinv10,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+
+            d                = _fjsp_sub_v2r8(r10,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv10,_fjsp_mul_v2r8(velec,dsw)) );
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+            {
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(rinv20,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+
+            d                = _fjsp_sub_v2r8(r20,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv20,_fjsp_mul_v2r8(velec,dsw)) );
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq30,rcutoff2))
+            {
+
+            r30              = _fjsp_mul_v2r8(rsq30,rinv30);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _fjsp_mul_v2r8(iq3,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r30,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq30,_fjsp_sub_v2r8(rinv30,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq30,rinv30),_fjsp_sub_v2r8(rinvsq30,felec));
+
+            d                = _fjsp_sub_v2r8(r30,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv30,_fjsp_mul_v2r8(velec,dsw)) );
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq30,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+            
+            fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+
+            }
+
+            gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 269 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_4atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 26 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_VF,outeriter*26 + inneriter*269);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LennardJones
+ * Geometry:                   Water4-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    _fjsp_v2r8       rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
+    real             rswitch_scalar,d_scalar;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+    ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+    rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+
+    rswitch_scalar   = fr->rcoulomb_switch;
+    rswitch          = gmx_fjsp_set1_v2r8(rswitch_scalar);
+    /* Setup switch parameters */
+    d_scalar         = rcutoff_scalar-rswitch_scalar;
+    d                = gmx_fjsp_set1_v2r8(d_scalar);
+    swV3             = gmx_fjsp_set1_v2r8(-10.0/(d_scalar*d_scalar*d_scalar));
+    swV4             = gmx_fjsp_set1_v2r8( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+    swV5             = gmx_fjsp_set1_v2r8( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+    swF2             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar));
+    swF3             = gmx_fjsp_set1_v2r8( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+    swF4             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_4rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+        fix3             = _fjsp_setzero_v2r8();
+        fiy3             = _fjsp_setzero_v2r8();
+        fiz3             = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx30             = _fjsp_sub_v2r8(ix3,jx0);
+            dy30             = _fjsp_sub_v2r8(iy3,jy0);
+            dz30             = _fjsp_sub_v2r8(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            d                = _fjsp_sub_v2r8(r00,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            fvdw             = _fjsp_msub_v2r8( fvdw,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(vvdw,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            fscal            = fvdw;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+            {
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(rinv10,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+
+            d                = _fjsp_sub_v2r8(r10,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv10,_fjsp_mul_v2r8(velec,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+            {
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(rinv20,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+
+            d                = _fjsp_sub_v2r8(r20,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv20,_fjsp_mul_v2r8(velec,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq30,rcutoff2))
+            {
+
+            r30              = _fjsp_mul_v2r8(rsq30,rinv30);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _fjsp_mul_v2r8(iq3,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r30,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq30,_fjsp_sub_v2r8(rinv30,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq30,rinv30),_fjsp_sub_v2r8(rinvsq30,felec));
+
+            d                = _fjsp_sub_v2r8(r30,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv30,_fjsp_mul_v2r8(velec,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq30,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+            
+            fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+
+            }
+
+            gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 257 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx30             = _fjsp_sub_v2r8(ix3,jx0);
+            dy30             = _fjsp_sub_v2r8(iy3,jy0);
+            dz30             = _fjsp_sub_v2r8(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            d                = _fjsp_sub_v2r8(r00,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            fvdw             = _fjsp_msub_v2r8( fvdw,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(vvdw,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            fscal            = fvdw;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+            {
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(rinv10,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+
+            d                = _fjsp_sub_v2r8(r10,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv10,_fjsp_mul_v2r8(velec,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+            {
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(rinv20,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+
+            d                = _fjsp_sub_v2r8(r20,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv20,_fjsp_mul_v2r8(velec,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq30,rcutoff2))
+            {
+
+            r30              = _fjsp_mul_v2r8(rsq30,rinv30);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _fjsp_mul_v2r8(iq3,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r30,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq30,_fjsp_sub_v2r8(rinv30,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq30,rinv30),_fjsp_sub_v2r8(rinvsq30,felec));
+
+            d                = _fjsp_sub_v2r8(r30,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv30,_fjsp_mul_v2r8(velec,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq30,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+            
+            fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+
+            }
+
+            gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 257 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_4atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 24 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_F,outeriter*24 + inneriter*257);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_sparc64_hpc_ace_double.c b/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..d76393f
--- /dev/null
@@ -0,0 +1,3139 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LennardJones
+ * Geometry:                   Water4-Water4
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B;
+    _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B;
+    _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    int              vdwjidx3A,vdwjidx3B;
+    _fjsp_v2r8       jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    _fjsp_v2r8       dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+    _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    _fjsp_v2r8       dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+    _fjsp_v2r8       dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+    _fjsp_v2r8       dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+    _fjsp_v2r8       dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    _fjsp_v2r8       rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
+    real             rswitch_scalar,d_scalar;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+    ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+    jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+    jq3              = gmx_fjsp_set1_v2r8(charge[inr+3]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    c6_00            = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    qq11             = _fjsp_mul_v2r8(iq1,jq1);
+    qq12             = _fjsp_mul_v2r8(iq1,jq2);
+    qq13             = _fjsp_mul_v2r8(iq1,jq3);
+    qq21             = _fjsp_mul_v2r8(iq2,jq1);
+    qq22             = _fjsp_mul_v2r8(iq2,jq2);
+    qq23             = _fjsp_mul_v2r8(iq2,jq3);
+    qq31             = _fjsp_mul_v2r8(iq3,jq1);
+    qq32             = _fjsp_mul_v2r8(iq3,jq2);
+    qq33             = _fjsp_mul_v2r8(iq3,jq3);
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+    rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+
+    rswitch_scalar   = fr->rcoulomb_switch;
+    rswitch          = gmx_fjsp_set1_v2r8(rswitch_scalar);
+    /* Setup switch parameters */
+    d_scalar         = rcutoff_scalar-rswitch_scalar;
+    d                = gmx_fjsp_set1_v2r8(d_scalar);
+    swV3             = gmx_fjsp_set1_v2r8(-10.0/(d_scalar*d_scalar*d_scalar));
+    swV4             = gmx_fjsp_set1_v2r8( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+    swV5             = gmx_fjsp_set1_v2r8( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+    swF2             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar));
+    swF3             = gmx_fjsp_set1_v2r8( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+    swF4             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_4rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+        fix3             = _fjsp_setzero_v2r8();
+        fiy3             = _fjsp_setzero_v2r8();
+        fiz3             = _fjsp_setzero_v2r8();
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+        vvdwsum          = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_4rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx13             = _fjsp_sub_v2r8(ix1,jx3);
+            dy13             = _fjsp_sub_v2r8(iy1,jy3);
+            dz13             = _fjsp_sub_v2r8(iz1,jz3);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+            dx23             = _fjsp_sub_v2r8(ix2,jx3);
+            dy23             = _fjsp_sub_v2r8(iy2,jy3);
+            dz23             = _fjsp_sub_v2r8(iz2,jz3);
+            dx31             = _fjsp_sub_v2r8(ix3,jx1);
+            dy31             = _fjsp_sub_v2r8(iy3,jy1);
+            dz31             = _fjsp_sub_v2r8(iz3,jz1);
+            dx32             = _fjsp_sub_v2r8(ix3,jx2);
+            dy32             = _fjsp_sub_v2r8(iy3,jy2);
+            dz32             = _fjsp_sub_v2r8(iz3,jz2);
+            dx33             = _fjsp_sub_v2r8(ix3,jx3);
+            dy33             = _fjsp_sub_v2r8(iy3,jy3);
+            dz33             = _fjsp_sub_v2r8(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+            rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+            rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+            rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+            rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+            rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+            rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+            rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+            rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+            rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+            rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+            rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+            rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+            fjx3             = _fjsp_setzero_v2r8();
+            fjy3             = _fjsp_setzero_v2r8();
+            fjz3             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            d                = _fjsp_sub_v2r8(r00,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            fvdw             = _fjsp_msub_v2r8( fvdw,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(vvdw,dsw)) );
+            vvdw             = _fjsp_mul_v2r8(vvdw,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdw             = _fjsp_and_v2r8(vvdw,cutoff_mask);
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
+            {
+
+            r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(rinv11,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
+
+            d                = _fjsp_sub_v2r8(r11,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv11,_fjsp_mul_v2r8(velec,dsw)) );
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
+            {
+
+            r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(rinv12,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
+
+            d                = _fjsp_sub_v2r8(r12,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv12,_fjsp_mul_v2r8(velec,dsw)) );
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq13,rcutoff2))
+            {
+
+            r13              = _fjsp_mul_v2r8(rsq13,rinv13);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r13,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq13,_fjsp_sub_v2r8(rinv13,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq13,rinv13),_fjsp_sub_v2r8(rinvsq13,felec));
+
+            d                = _fjsp_sub_v2r8(r13,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv13,_fjsp_mul_v2r8(velec,dsw)) );
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq13,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+            
+            fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
+            {
+
+            r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(rinv21,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
+
+            d                = _fjsp_sub_v2r8(r21,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv21,_fjsp_mul_v2r8(velec,dsw)) );
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
+            {
+
+            r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(rinv22,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
+
+            d                = _fjsp_sub_v2r8(r22,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv22,_fjsp_mul_v2r8(velec,dsw)) );
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq23,rcutoff2))
+            {
+
+            r23              = _fjsp_mul_v2r8(rsq23,rinv23);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r23,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq23,_fjsp_sub_v2r8(rinv23,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq23,rinv23),_fjsp_sub_v2r8(rinvsq23,felec));
+
+            d                = _fjsp_sub_v2r8(r23,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv23,_fjsp_mul_v2r8(velec,dsw)) );
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq23,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+            
+            fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq31,rcutoff2))
+            {
+
+            r31              = _fjsp_mul_v2r8(rsq31,rinv31);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r31,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq31,_fjsp_sub_v2r8(rinv31,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq31,rinv31),_fjsp_sub_v2r8(rinvsq31,felec));
+
+            d                = _fjsp_sub_v2r8(r31,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv31,_fjsp_mul_v2r8(velec,dsw)) );
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq31,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+            
+            fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq32,rcutoff2))
+            {
+
+            r32              = _fjsp_mul_v2r8(rsq32,rinv32);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r32,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq32,_fjsp_sub_v2r8(rinv32,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq32,rinv32),_fjsp_sub_v2r8(rinvsq32,felec));
+
+            d                = _fjsp_sub_v2r8(r32,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv32,_fjsp_mul_v2r8(velec,dsw)) );
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq32,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+            
+            fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq33,rcutoff2))
+            {
+
+            r33              = _fjsp_mul_v2r8(rsq33,rinv33);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r33,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq33,_fjsp_sub_v2r8(rinv33,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq33,rinv33),_fjsp_sub_v2r8(rinvsq33,felec));
+
+            d                = _fjsp_sub_v2r8(r33,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv33,_fjsp_mul_v2r8(velec,dsw)) );
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq33,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+            
+            fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+
+            }
+
+            gmx_fjsp_decrement_4rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 677 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_4rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx13             = _fjsp_sub_v2r8(ix1,jx3);
+            dy13             = _fjsp_sub_v2r8(iy1,jy3);
+            dz13             = _fjsp_sub_v2r8(iz1,jz3);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+            dx23             = _fjsp_sub_v2r8(ix2,jx3);
+            dy23             = _fjsp_sub_v2r8(iy2,jy3);
+            dz23             = _fjsp_sub_v2r8(iz2,jz3);
+            dx31             = _fjsp_sub_v2r8(ix3,jx1);
+            dy31             = _fjsp_sub_v2r8(iy3,jy1);
+            dz31             = _fjsp_sub_v2r8(iz3,jz1);
+            dx32             = _fjsp_sub_v2r8(ix3,jx2);
+            dy32             = _fjsp_sub_v2r8(iy3,jy2);
+            dz32             = _fjsp_sub_v2r8(iz3,jz2);
+            dx33             = _fjsp_sub_v2r8(ix3,jx3);
+            dy33             = _fjsp_sub_v2r8(iy3,jy3);
+            dz33             = _fjsp_sub_v2r8(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+            rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+            rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+            rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+            rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+            rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+            rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+            rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+            rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+            rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+            rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+            rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+            rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+            fjx3             = _fjsp_setzero_v2r8();
+            fjy3             = _fjsp_setzero_v2r8();
+            fjz3             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            d                = _fjsp_sub_v2r8(r00,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            fvdw             = _fjsp_msub_v2r8( fvdw,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(vvdw,dsw)) );
+            vvdw             = _fjsp_mul_v2r8(vvdw,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdw             = _fjsp_and_v2r8(vvdw,cutoff_mask);
+            vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
+            {
+
+            r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(rinv11,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
+
+            d                = _fjsp_sub_v2r8(r11,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv11,_fjsp_mul_v2r8(velec,dsw)) );
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
+            {
+
+            r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(rinv12,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
+
+            d                = _fjsp_sub_v2r8(r12,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv12,_fjsp_mul_v2r8(velec,dsw)) );
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq13,rcutoff2))
+            {
+
+            r13              = _fjsp_mul_v2r8(rsq13,rinv13);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r13,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq13,_fjsp_sub_v2r8(rinv13,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq13,rinv13),_fjsp_sub_v2r8(rinvsq13,felec));
+
+            d                = _fjsp_sub_v2r8(r13,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv13,_fjsp_mul_v2r8(velec,dsw)) );
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq13,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+            
+            fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
+            {
+
+            r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(rinv21,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
+
+            d                = _fjsp_sub_v2r8(r21,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv21,_fjsp_mul_v2r8(velec,dsw)) );
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
+            {
+
+            r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(rinv22,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
+
+            d                = _fjsp_sub_v2r8(r22,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv22,_fjsp_mul_v2r8(velec,dsw)) );
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq23,rcutoff2))
+            {
+
+            r23              = _fjsp_mul_v2r8(rsq23,rinv23);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r23,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq23,_fjsp_sub_v2r8(rinv23,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq23,rinv23),_fjsp_sub_v2r8(rinvsq23,felec));
+
+            d                = _fjsp_sub_v2r8(r23,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv23,_fjsp_mul_v2r8(velec,dsw)) );
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq23,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+            
+            fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq31,rcutoff2))
+            {
+
+            r31              = _fjsp_mul_v2r8(rsq31,rinv31);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r31,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq31,_fjsp_sub_v2r8(rinv31,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq31,rinv31),_fjsp_sub_v2r8(rinvsq31,felec));
+
+            d                = _fjsp_sub_v2r8(r31,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv31,_fjsp_mul_v2r8(velec,dsw)) );
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq31,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+            
+            fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq32,rcutoff2))
+            {
+
+            r32              = _fjsp_mul_v2r8(rsq32,rinv32);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r32,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq32,_fjsp_sub_v2r8(rinv32,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq32,rinv32),_fjsp_sub_v2r8(rinvsq32,felec));
+
+            d                = _fjsp_sub_v2r8(r32,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv32,_fjsp_mul_v2r8(velec,dsw)) );
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq32,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+            
+            fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq33,rcutoff2))
+            {
+
+            r33              = _fjsp_mul_v2r8(rsq33,rinv33);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r33,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq33,_fjsp_sub_v2r8(rinv33,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq33,rinv33),_fjsp_sub_v2r8(rinvsq33,felec));
+
+            d                = _fjsp_sub_v2r8(r33,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv33,_fjsp_mul_v2r8(velec,dsw)) );
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq33,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+            
+            fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+
+            }
+
+            gmx_fjsp_decrement_4rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 677 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_4atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 26 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*677);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: Ewald (with potential switch)
+ * VdW interaction:            LennardJones (with potential switch)
+ * Geometry:                   Water4-Water4
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B;
+    _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B;
+    _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    int              vdwjidx3A,vdwjidx3B;
+    _fjsp_v2r8       jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    _fjsp_v2r8       dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+    _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    _fjsp_v2r8       dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+    _fjsp_v2r8       dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+    _fjsp_v2r8       dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+    _fjsp_v2r8       dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    _fjsp_v2r8       rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
+    real             rswitch_scalar,d_scalar;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+    ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+    jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+    jq3              = gmx_fjsp_set1_v2r8(charge[inr+3]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    c6_00            = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    qq11             = _fjsp_mul_v2r8(iq1,jq1);
+    qq12             = _fjsp_mul_v2r8(iq1,jq2);
+    qq13             = _fjsp_mul_v2r8(iq1,jq3);
+    qq21             = _fjsp_mul_v2r8(iq2,jq1);
+    qq22             = _fjsp_mul_v2r8(iq2,jq2);
+    qq23             = _fjsp_mul_v2r8(iq2,jq3);
+    qq31             = _fjsp_mul_v2r8(iq3,jq1);
+    qq32             = _fjsp_mul_v2r8(iq3,jq2);
+    qq33             = _fjsp_mul_v2r8(iq3,jq3);
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+    rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+
+    rswitch_scalar   = fr->rcoulomb_switch;
+    rswitch          = gmx_fjsp_set1_v2r8(rswitch_scalar);
+    /* Setup switch parameters */
+    d_scalar         = rcutoff_scalar-rswitch_scalar;
+    d                = gmx_fjsp_set1_v2r8(d_scalar);
+    swV3             = gmx_fjsp_set1_v2r8(-10.0/(d_scalar*d_scalar*d_scalar));
+    swV4             = gmx_fjsp_set1_v2r8( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+    swV5             = gmx_fjsp_set1_v2r8( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+    swF2             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar));
+    swF3             = gmx_fjsp_set1_v2r8( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+    swF4             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_4rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+        fix3             = _fjsp_setzero_v2r8();
+        fiy3             = _fjsp_setzero_v2r8();
+        fiz3             = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_4rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx13             = _fjsp_sub_v2r8(ix1,jx3);
+            dy13             = _fjsp_sub_v2r8(iy1,jy3);
+            dz13             = _fjsp_sub_v2r8(iz1,jz3);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+            dx23             = _fjsp_sub_v2r8(ix2,jx3);
+            dy23             = _fjsp_sub_v2r8(iy2,jy3);
+            dz23             = _fjsp_sub_v2r8(iz2,jz3);
+            dx31             = _fjsp_sub_v2r8(ix3,jx1);
+            dy31             = _fjsp_sub_v2r8(iy3,jy1);
+            dz31             = _fjsp_sub_v2r8(iz3,jz1);
+            dx32             = _fjsp_sub_v2r8(ix3,jx2);
+            dy32             = _fjsp_sub_v2r8(iy3,jy2);
+            dz32             = _fjsp_sub_v2r8(iz3,jz2);
+            dx33             = _fjsp_sub_v2r8(ix3,jx3);
+            dy33             = _fjsp_sub_v2r8(iy3,jy3);
+            dz33             = _fjsp_sub_v2r8(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+            rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+            rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+            rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+            rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+            rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+            rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+            rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+            rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+            rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+            rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+            rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+            rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+            fjx3             = _fjsp_setzero_v2r8();
+            fjy3             = _fjsp_setzero_v2r8();
+            fjz3             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            d                = _fjsp_sub_v2r8(r00,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            fvdw             = _fjsp_msub_v2r8( fvdw,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(vvdw,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            fscal            = fvdw;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
+            {
+
+            r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(rinv11,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
+
+            d                = _fjsp_sub_v2r8(r11,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv11,_fjsp_mul_v2r8(velec,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
+            {
+
+            r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(rinv12,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
+
+            d                = _fjsp_sub_v2r8(r12,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv12,_fjsp_mul_v2r8(velec,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq13,rcutoff2))
+            {
+
+            r13              = _fjsp_mul_v2r8(rsq13,rinv13);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r13,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq13,_fjsp_sub_v2r8(rinv13,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq13,rinv13),_fjsp_sub_v2r8(rinvsq13,felec));
+
+            d                = _fjsp_sub_v2r8(r13,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv13,_fjsp_mul_v2r8(velec,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq13,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+            
+            fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
+            {
+
+            r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(rinv21,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
+
+            d                = _fjsp_sub_v2r8(r21,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv21,_fjsp_mul_v2r8(velec,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
+            {
+
+            r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(rinv22,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
+
+            d                = _fjsp_sub_v2r8(r22,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv22,_fjsp_mul_v2r8(velec,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq23,rcutoff2))
+            {
+
+            r23              = _fjsp_mul_v2r8(rsq23,rinv23);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r23,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq23,_fjsp_sub_v2r8(rinv23,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq23,rinv23),_fjsp_sub_v2r8(rinvsq23,felec));
+
+            d                = _fjsp_sub_v2r8(r23,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv23,_fjsp_mul_v2r8(velec,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq23,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+            
+            fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq31,rcutoff2))
+            {
+
+            r31              = _fjsp_mul_v2r8(rsq31,rinv31);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r31,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq31,_fjsp_sub_v2r8(rinv31,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq31,rinv31),_fjsp_sub_v2r8(rinvsq31,felec));
+
+            d                = _fjsp_sub_v2r8(r31,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv31,_fjsp_mul_v2r8(velec,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq31,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+            
+            fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq32,rcutoff2))
+            {
+
+            r32              = _fjsp_mul_v2r8(rsq32,rinv32);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r32,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq32,_fjsp_sub_v2r8(rinv32,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq32,rinv32),_fjsp_sub_v2r8(rinvsq32,felec));
+
+            d                = _fjsp_sub_v2r8(r32,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv32,_fjsp_mul_v2r8(velec,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq32,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+            
+            fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq33,rcutoff2))
+            {
+
+            r33              = _fjsp_mul_v2r8(rsq33,rinv33);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r33,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq33,_fjsp_sub_v2r8(rinv33,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq33,rinv33),_fjsp_sub_v2r8(rinvsq33,felec));
+
+            d                = _fjsp_sub_v2r8(r33,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv33,_fjsp_mul_v2r8(velec,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq33,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+            
+            fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+
+            }
+
+            gmx_fjsp_decrement_4rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 647 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_4rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx13             = _fjsp_sub_v2r8(ix1,jx3);
+            dy13             = _fjsp_sub_v2r8(iy1,jy3);
+            dz13             = _fjsp_sub_v2r8(iz1,jz3);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+            dx23             = _fjsp_sub_v2r8(ix2,jx3);
+            dy23             = _fjsp_sub_v2r8(iy2,jy3);
+            dz23             = _fjsp_sub_v2r8(iz2,jz3);
+            dx31             = _fjsp_sub_v2r8(ix3,jx1);
+            dy31             = _fjsp_sub_v2r8(iy3,jy1);
+            dz31             = _fjsp_sub_v2r8(iz3,jz1);
+            dx32             = _fjsp_sub_v2r8(ix3,jx2);
+            dy32             = _fjsp_sub_v2r8(iy3,jy2);
+            dz32             = _fjsp_sub_v2r8(iz3,jz2);
+            dx33             = _fjsp_sub_v2r8(ix3,jx3);
+            dy33             = _fjsp_sub_v2r8(iy3,jy3);
+            dz33             = _fjsp_sub_v2r8(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+            rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+            rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+            rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+            rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+            rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+            rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+            rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+            rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+            rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+            rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+            rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+            rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+            fjx3             = _fjsp_setzero_v2r8();
+            fjy3             = _fjsp_setzero_v2r8();
+            fjz3             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            d                = _fjsp_sub_v2r8(r00,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            fvdw             = _fjsp_msub_v2r8( fvdw,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(vvdw,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            fscal            = fvdw;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
+            {
+
+            r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(rinv11,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
+
+            d                = _fjsp_sub_v2r8(r11,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv11,_fjsp_mul_v2r8(velec,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
+            {
+
+            r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(rinv12,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
+
+            d                = _fjsp_sub_v2r8(r12,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv12,_fjsp_mul_v2r8(velec,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq13,rcutoff2))
+            {
+
+            r13              = _fjsp_mul_v2r8(rsq13,rinv13);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r13,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq13,_fjsp_sub_v2r8(rinv13,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq13,rinv13),_fjsp_sub_v2r8(rinvsq13,felec));
+
+            d                = _fjsp_sub_v2r8(r13,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv13,_fjsp_mul_v2r8(velec,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq13,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+            
+            fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
+            {
+
+            r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(rinv21,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
+
+            d                = _fjsp_sub_v2r8(r21,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv21,_fjsp_mul_v2r8(velec,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
+            {
+
+            r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(rinv22,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
+
+            d                = _fjsp_sub_v2r8(r22,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv22,_fjsp_mul_v2r8(velec,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq23,rcutoff2))
+            {
+
+            r23              = _fjsp_mul_v2r8(rsq23,rinv23);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r23,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq23,_fjsp_sub_v2r8(rinv23,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq23,rinv23),_fjsp_sub_v2r8(rinvsq23,felec));
+
+            d                = _fjsp_sub_v2r8(r23,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv23,_fjsp_mul_v2r8(velec,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq23,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+            
+            fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq31,rcutoff2))
+            {
+
+            r31              = _fjsp_mul_v2r8(rsq31,rinv31);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r31,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq31,_fjsp_sub_v2r8(rinv31,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq31,rinv31),_fjsp_sub_v2r8(rinvsq31,felec));
+
+            d                = _fjsp_sub_v2r8(r31,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv31,_fjsp_mul_v2r8(velec,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq31,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+            
+            fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq32,rcutoff2))
+            {
+
+            r32              = _fjsp_mul_v2r8(rsq32,rinv32);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r32,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq32,_fjsp_sub_v2r8(rinv32,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq32,rinv32),_fjsp_sub_v2r8(rinvsq32,felec));
+
+            d                = _fjsp_sub_v2r8(r32,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv32,_fjsp_mul_v2r8(velec,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq32,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+            
+            fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq33,rcutoff2))
+            {
+
+            r33              = _fjsp_mul_v2r8(rsq33,rinv33);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r33,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq33,_fjsp_sub_v2r8(rinv33,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq33,rinv33),_fjsp_sub_v2r8(rinvsq33,felec));
+
+            d                = _fjsp_sub_v2r8(r33,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv33,_fjsp_mul_v2r8(velec,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq33,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+            
+            fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+
+            }
+
+            gmx_fjsp_decrement_4rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 647 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_4atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 24 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*647);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEwSw_VdwNone_GeomP1P1_sparc64_hpc_ace_double.c b/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEwSw_VdwNone_GeomP1P1_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..898dec9
--- /dev/null
@@ -0,0 +1,682 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSw_VdwNone_GeomP1P1_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            None
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecEwSw_VdwNone_GeomP1P1_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    _fjsp_v2r8       rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
+    real             rswitch_scalar,d_scalar;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+
+    sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+    ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+    rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+
+    rswitch_scalar   = fr->rcoulomb_switch;
+    rswitch          = gmx_fjsp_set1_v2r8(rswitch_scalar);
+    /* Setup switch parameters */
+    d_scalar         = rcutoff_scalar-rswitch_scalar;
+    d                = gmx_fjsp_set1_v2r8(d_scalar);
+    swV3             = gmx_fjsp_set1_v2r8(-10.0/(d_scalar*d_scalar*d_scalar));
+    swV4             = gmx_fjsp_set1_v2r8( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+    swV5             = gmx_fjsp_set1_v2r8( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+    swF2             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar));
+    swF3             = gmx_fjsp_set1_v2r8( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+    swF4             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_1rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+
+        /* Load parameters for i particles */
+        iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_load1_v2r8(charge+inr+0));
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(rinv00,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+
+            d                = _fjsp_sub_v2r8(r00,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(velec,dsw)) );
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fscal,dx00,dy00,dz00);
+
+            }
+
+            /* Inner loop uses 68 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(rinv00,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+
+            d                = _fjsp_sub_v2r8(r00,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(velec,dsw)) );
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fscal,dx00,dy00,dz00);
+
+            }
+
+            /* Inner loop uses 68 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_1atom_swizzle_v2r8(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 8 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VF,outeriter*8 + inneriter*68);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSw_VdwNone_GeomP1P1_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            None
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        Force
+ *
+ * Force-only kernel. For every i particle in the neighborlist it walks the
+ * j neighbors two at a time (suffixes A and B, one per SIMD lane), evaluates
+ * table-interpolated Ewald electrostatics multiplied by a switch function
+ * acting between fr->rcoulomb_switch and fr->rcoulomb, and hands the
+ * resulting scalar force to the gmx_fjsp_* helpers that update f and fshift.
+ * No potential energy is accumulated or stored by this variant.
+ *
+ * Parameters:
+ *   nlist       - neighborlist; nri, iinr, jindex, jjnr, shift and gid are read.
+ *   xx          - particle coordinates, accessed as a flat array of reals.
+ *   ff          - particle forces, accessed as a flat array of reals and
+ *                 passed to the force update helpers.
+ *   fr          - force record; shift_vec, fshift, epsfac, rcoulomb,
+ *                 rcoulomb_switch and the ic->sh_ewald, ic->tabq_coul_FDV0 and
+ *                 ic->tabq_scale fields are read.
+ *   mdatoms     - supplies the chargeA array.
+ *   kernel_data - not referenced in this force-only kernel.
+ *   nrnb        - flop accounting, incremented once at the end via inc_nrnb().
+ */
+void
+nb_kernel_ElecEwSw_VdwNone_GeomP1P1_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     *
+     * NOTE(review): several locals declared below are never referenced in this
+     * kernel (tx, ty, tz, jidxall, vdwioffset0, vdwjidx0A, vdwjidx0B, c6_00,
+     * c12_00, velecsum, crf, krf, krf2, dummy_mask, two, vfconv, gbconv, ggid).
+     * Presumably they come from a template shared by the kernel generator;
+     * confirm before removing any of them by hand.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    _fjsp_v2r8       rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
+    real             rswitch_scalar,d_scalar;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0]; /* coordinates as a flat real array, indexed with DIM*atom below */
+    f                = ff[0]; /* forces as a flat real array, indexed with DIM*atom below */
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid; /* fetched but not read again in this force-only kernel */
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+
+    sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald); /* set but not read again in this kernel */
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+    ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale); /* 0.5/tabq_scale, used when interpolating velec */
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+    rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+
+    rswitch_scalar   = fr->rcoulomb_switch;
+    rswitch          = gmx_fjsp_set1_v2r8(rswitch_scalar);
+    /* Setup switch parameters.
+     * With D = rcutoff - rswitch (d_scalar), swV3..swV5 hold -10/D^3, 15/D^4
+     * and -6/D^5, the coefficients of the switch polynomial
+     *     sw  = 1 + d^3*(swV3 + d*(swV4 + d*swV5)),
+     * and swF2..swF4 hold -30/D^3, 60/D^4 and -30/D^5, the coefficients of
+     * its derivative
+     *     dsw = d^2*(swF2 + d*(swF3 + d*swF4)),
+     * where d = max(r - rswitch, 0) is computed per pair inside the j loops.
+     */
+    d_scalar         = rcutoff_scalar-rswitch_scalar;
+    d                = gmx_fjsp_set1_v2r8(d_scalar); /* overwritten inside the j loops before it is read */
+    swV3             = gmx_fjsp_set1_v2r8(-10.0/(d_scalar*d_scalar*d_scalar));
+    swV4             = gmx_fjsp_set1_v2r8( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+    swV5             = gmx_fjsp_set1_v2r8( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+    swF2             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar));
+    swF3             = gmx_fjsp_set1_v2r8( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+    swF4             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+
+    /* Avoid stupid compiler warnings: these indices are otherwise first
+     * assigned inside the j loops. */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors: the j entries of this i
+         * particle are jjnr[j_index_start] .. jjnr[j_index_end-1]. */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_1rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+
+        /* Load parameters for i particles: the i charge is pre-multiplied by
+         * facel (fr->epsfac), so qq00 below only needs one multiply. */
+        iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_load1_v2r8(charge+inr+0));
+
+        /* Start inner kernel loop.
+         * Each iteration handles two j neighbors (A and B). When the number
+         * of neighbors is odd, the last one is left for the block that
+         * follows this loop. */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2)) /* presumably true when at least one lane is inside the cutoff */
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00); /* r = rsq * (1/r) */
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+
+            /* EWALD ELECTROSTATICS
+             * The table is addressed with a stride of 4 reals per point:
+             * offsets 0 and 1 are loaded as a pair into ewtabF/ewtabD and
+             * offset 2 into ewtabV. (The field name tabq_coul_FDV0 suggests
+             * the layout F, D, V, 0 - TODO confirm.) The TRANSPOSE2 macro
+             * presumably regroups the two per-particle loads so that each
+             * register holds one quantity for both j particles.
+             */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer.
+             * eweps is what remains of ewrt after subtracting the converted
+             * index; the indices are stored through the ewconv union so they
+             * can be read back as scalars ewconv.i[0] and ewconv.i[1]. */
+            ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF); /* table value, presumably ewtabF + eweps*ewtabD */
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(rinv00,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+
+            d                = _fjsp_sub_v2r8(r00,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8()); /* clamp at zero for r below rswitch */
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function.
+             * velec is computed even in this force-only kernel because the
+             * switched force below contains the term velec*dsw/r. */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(velec,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2); /* per-lane mask, presumably set where rsq < rcutoff2 */
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask); /* presumably zeroes the force in lanes beyond the cutoff */
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fscal,dx00,dy00,dz00);
+
+            }
+
+            /* Inner loop uses 65 flops */
+        }
+
+        if(jidx<j_index_end) /* odd neighbor count: exactly one j particle is left */
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+
+            /* EWALD ELECTROSTATICS
+             * Same table lookup as in the two-particle loop, except that only
+             * ewconv.i[0] is used as an index and zero registers are
+             * supplied in place of a second table row. */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8(); /* no second j particle */
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8(); /* no second j particle */
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(rinv00,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+
+            d                = _fjsp_sub_v2r8(r00,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(velec,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8()); /* presumably keeps the first lane and zeroes the unused second lane */
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fscal,dx00,dy00,dz00);
+
+            }
+
+            /* Inner loop uses 65 flops */
+        }
+
+        /* End of innermost loop.
+         * The i force accumulated in fix0/fiy0/fiz0 is passed on together
+         * with the i particle force pointer and the shift force pointer. */
+
+        gmx_fjsp_update_iforce_1atom_swizzle_v2r8(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations (counted per j particle, not
+         * per trip through the unrolled loop) */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 7 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops, using the per-iteration counts quoted in the
+     * loop comments above (7 per outer, 65 per inner iteration) */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_F,outeriter*7 + inneriter*65);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEwSw_VdwNone_GeomW3P1_sparc64_hpc_ace_double.c b/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEwSw_VdwNone_GeomW3P1_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..e14474f
--- /dev/null
@@ -0,0 +1,1288 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSw_VdwNone_GeomW3P1_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            None
+ * Geometry:                   Water3-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecEwSw_VdwNone_GeomW3P1_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    _fjsp_v2r8       rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
+    real             rswitch_scalar,d_scalar;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+
+    sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+    ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+    rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+
+    rswitch_scalar   = fr->rcoulomb_switch;
+    rswitch          = gmx_fjsp_set1_v2r8(rswitch_scalar);
+    /* Setup switch parameters */
+    d_scalar         = rcutoff_scalar-rswitch_scalar;
+    d                = gmx_fjsp_set1_v2r8(d_scalar);
+    swV3             = gmx_fjsp_set1_v2r8(-10.0/(d_scalar*d_scalar*d_scalar));
+    swV4             = gmx_fjsp_set1_v2r8( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+    swV5             = gmx_fjsp_set1_v2r8( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+    swF2             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar));
+    swF3             = gmx_fjsp_set1_v2r8( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+    swF4             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(rinv00,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+
+            d                = _fjsp_sub_v2r8(r00,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(velec,dsw)) );
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+            {
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(rinv10,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+
+            d                = _fjsp_sub_v2r8(r10,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv10,_fjsp_mul_v2r8(velec,dsw)) );
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+            {
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(rinv20,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+
+            d                = _fjsp_sub_v2r8(r20,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv20,_fjsp_mul_v2r8(velec,dsw)) );
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+
+            gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 207 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(rinv00,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+
+            d                = _fjsp_sub_v2r8(r00,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(velec,dsw)) );
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+            {
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(rinv10,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+
+            d                = _fjsp_sub_v2r8(r10,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv10,_fjsp_mul_v2r8(velec,dsw)) );
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+            {
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(rinv20,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+
+            d                = _fjsp_sub_v2r8(r20,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv20,_fjsp_mul_v2r8(velec,dsw)) );
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+
+            gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 207 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 19 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3_VF,outeriter*19 + inneriter*207);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSw_VdwNone_GeomW3P1_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            None
+ * Geometry:                   Water3-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecEwSw_VdwNone_GeomW3P1_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    _fjsp_v2r8       rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
+    real             rswitch_scalar,d_scalar;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+
+    sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+    ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+    rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+
+    rswitch_scalar   = fr->rcoulomb_switch;
+    rswitch          = gmx_fjsp_set1_v2r8(rswitch_scalar);
+    /* Setup switch parameters */
+    d_scalar         = rcutoff_scalar-rswitch_scalar;
+    d                = gmx_fjsp_set1_v2r8(d_scalar);
+    swV3             = gmx_fjsp_set1_v2r8(-10.0/(d_scalar*d_scalar*d_scalar));
+    swV4             = gmx_fjsp_set1_v2r8( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+    swV5             = gmx_fjsp_set1_v2r8( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+    swF2             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar));
+    swF3             = gmx_fjsp_set1_v2r8( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+    swF4             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(rinv00,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+
+            d                = _fjsp_sub_v2r8(r00,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(velec,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+            {
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(rinv10,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+
+            d                = _fjsp_sub_v2r8(r10,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv10,_fjsp_mul_v2r8(velec,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+            {
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(rinv20,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+
+            d                = _fjsp_sub_v2r8(r20,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv20,_fjsp_mul_v2r8(velec,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+
+            gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 198 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(rinv00,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+
+            d                = _fjsp_sub_v2r8(r00,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(velec,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+            {
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(rinv10,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+
+            d                = _fjsp_sub_v2r8(r10,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv10,_fjsp_mul_v2r8(velec,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+            {
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(rinv20,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+
+            d                = _fjsp_sub_v2r8(r20,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv20,_fjsp_mul_v2r8(velec,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+
+            gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 198 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 18 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3_F,outeriter*18 + inneriter*198);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEwSw_VdwNone_GeomW3W3_sparc64_hpc_ace_double.c b/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEwSw_VdwNone_GeomW3W3_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..a176d61
--- /dev/null
@@ -0,0 +1,2864 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSw_VdwNone_GeomW3W3_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            None
+ * Geometry:                   Water3-Water3
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecEwSw_VdwNone_GeomW3W3_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B;
+    _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B;
+    _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+    _fjsp_v2r8       dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    _fjsp_v2r8       rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
+    real             rswitch_scalar,d_scalar;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+
+    sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+    ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+
+    jq0              = gmx_fjsp_set1_v2r8(charge[inr+0]);
+    jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+    jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+    qq00             = _fjsp_mul_v2r8(iq0,jq0);
+    qq01             = _fjsp_mul_v2r8(iq0,jq1);
+    qq02             = _fjsp_mul_v2r8(iq0,jq2);
+    qq10             = _fjsp_mul_v2r8(iq1,jq0);
+    qq11             = _fjsp_mul_v2r8(iq1,jq1);
+    qq12             = _fjsp_mul_v2r8(iq1,jq2);
+    qq20             = _fjsp_mul_v2r8(iq2,jq0);
+    qq21             = _fjsp_mul_v2r8(iq2,jq1);
+    qq22             = _fjsp_mul_v2r8(iq2,jq2);
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+    rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+
+    rswitch_scalar   = fr->rcoulomb_switch;
+    rswitch          = gmx_fjsp_set1_v2r8(rswitch_scalar);
+    /* Setup switch parameters */
+    d_scalar         = rcutoff_scalar-rswitch_scalar;
+    d                = gmx_fjsp_set1_v2r8(d_scalar);
+    swV3             = gmx_fjsp_set1_v2r8(-10.0/(d_scalar*d_scalar*d_scalar));
+    swV4             = gmx_fjsp_set1_v2r8( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+    swV5             = gmx_fjsp_set1_v2r8( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+    swF2             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar));
+    swF3             = gmx_fjsp_set1_v2r8( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+    swF4             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx01             = _fjsp_sub_v2r8(ix0,jx1);
+            dy01             = _fjsp_sub_v2r8(iy0,jy1);
+            dz01             = _fjsp_sub_v2r8(iz0,jz1);
+            dx02             = _fjsp_sub_v2r8(ix0,jx2);
+            dy02             = _fjsp_sub_v2r8(iy0,jy2);
+            dz02             = _fjsp_sub_v2r8(iz0,jz2);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+            rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+            rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+            rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(rinv00,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+
+            d                = _fjsp_sub_v2r8(r00,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(velec,dsw)) );
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq01,rcutoff2))
+            {
+
+            r01              = _fjsp_mul_v2r8(rsq01,rinv01);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r01,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq01,_fjsp_sub_v2r8(rinv01,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq01,rinv01),_fjsp_sub_v2r8(rinvsq01,felec));
+
+            d                = _fjsp_sub_v2r8(r01,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv01,_fjsp_mul_v2r8(velec,dsw)) );
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq01,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+            
+            fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq02,rcutoff2))
+            {
+
+            r02              = _fjsp_mul_v2r8(rsq02,rinv02);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r02,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq02,_fjsp_sub_v2r8(rinv02,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq02,rinv02),_fjsp_sub_v2r8(rinvsq02,felec));
+
+            d                = _fjsp_sub_v2r8(r02,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv02,_fjsp_mul_v2r8(velec,dsw)) );
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq02,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+            
+            fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+            {
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(rinv10,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+
+            d                = _fjsp_sub_v2r8(r10,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv10,_fjsp_mul_v2r8(velec,dsw)) );
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
+            {
+
+            r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(rinv11,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
+
+            d                = _fjsp_sub_v2r8(r11,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv11,_fjsp_mul_v2r8(velec,dsw)) );
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
+            {
+
+            r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(rinv12,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
+
+            d                = _fjsp_sub_v2r8(r12,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv12,_fjsp_mul_v2r8(velec,dsw)) );
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+            {
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(rinv20,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+
+            d                = _fjsp_sub_v2r8(r20,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv20,_fjsp_mul_v2r8(velec,dsw)) );
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
+            {
+
+            r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(rinv21,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
+
+            d                = _fjsp_sub_v2r8(r21,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv21,_fjsp_mul_v2r8(velec,dsw)) );
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
+            {
+
+            r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(rinv22,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
+
+            d                = _fjsp_sub_v2r8(r22,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv22,_fjsp_mul_v2r8(velec,dsw)) );
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            }
+
+            gmx_fjsp_decrement_3rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 612 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx01             = _fjsp_sub_v2r8(ix0,jx1);
+            dy01             = _fjsp_sub_v2r8(iy0,jy1);
+            dz01             = _fjsp_sub_v2r8(iz0,jz1);
+            dx02             = _fjsp_sub_v2r8(ix0,jx2);
+            dy02             = _fjsp_sub_v2r8(iy0,jy2);
+            dz02             = _fjsp_sub_v2r8(iz0,jz2);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+            rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+            rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+            rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(rinv00,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+
+            d                = _fjsp_sub_v2r8(r00,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(velec,dsw)) );
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq01,rcutoff2))
+            {
+
+            r01              = _fjsp_mul_v2r8(rsq01,rinv01);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r01,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq01,_fjsp_sub_v2r8(rinv01,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq01,rinv01),_fjsp_sub_v2r8(rinvsq01,felec));
+
+            d                = _fjsp_sub_v2r8(r01,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv01,_fjsp_mul_v2r8(velec,dsw)) );
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq01,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+            
+            fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq02,rcutoff2))
+            {
+
+            r02              = _fjsp_mul_v2r8(rsq02,rinv02);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r02,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq02,_fjsp_sub_v2r8(rinv02,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq02,rinv02),_fjsp_sub_v2r8(rinvsq02,felec));
+
+            d                = _fjsp_sub_v2r8(r02,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv02,_fjsp_mul_v2r8(velec,dsw)) );
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq02,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+            
+            fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+            {
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(rinv10,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+
+            d                = _fjsp_sub_v2r8(r10,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv10,_fjsp_mul_v2r8(velec,dsw)) );
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
+            {
+
+            r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(rinv11,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
+
+            d                = _fjsp_sub_v2r8(r11,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv11,_fjsp_mul_v2r8(velec,dsw)) );
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
+            {
+
+            r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(rinv12,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
+
+            d                = _fjsp_sub_v2r8(r12,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv12,_fjsp_mul_v2r8(velec,dsw)) );
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+            {
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(rinv20,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+
+            d                = _fjsp_sub_v2r8(r20,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv20,_fjsp_mul_v2r8(velec,dsw)) );
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
+            {
+
+            r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(rinv21,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
+
+            d                = _fjsp_sub_v2r8(r21,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv21,_fjsp_mul_v2r8(velec,dsw)) );
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
+            {
+
+            r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(rinv22,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
+
+            d                = _fjsp_sub_v2r8(r22,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv22,_fjsp_mul_v2r8(velec,dsw)) );
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            }
+
+            gmx_fjsp_decrement_3rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 612 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 19 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_VF,outeriter*19 + inneriter*612);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSw_VdwNone_GeomW3W3_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            None
+ * Geometry:                   Water3-Water3
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecEwSw_VdwNone_GeomW3W3_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B;
+    _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B;
+    _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+    _fjsp_v2r8       dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    _fjsp_v2r8       rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
+    real             rswitch_scalar,d_scalar;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+
+    sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+    ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+
+    jq0              = gmx_fjsp_set1_v2r8(charge[inr+0]);
+    jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+    jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+    qq00             = _fjsp_mul_v2r8(iq0,jq0);
+    qq01             = _fjsp_mul_v2r8(iq0,jq1);
+    qq02             = _fjsp_mul_v2r8(iq0,jq2);
+    qq10             = _fjsp_mul_v2r8(iq1,jq0);
+    qq11             = _fjsp_mul_v2r8(iq1,jq1);
+    qq12             = _fjsp_mul_v2r8(iq1,jq2);
+    qq20             = _fjsp_mul_v2r8(iq2,jq0);
+    qq21             = _fjsp_mul_v2r8(iq2,jq1);
+    qq22             = _fjsp_mul_v2r8(iq2,jq2);
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+    rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+
+    rswitch_scalar   = fr->rcoulomb_switch;
+    rswitch          = gmx_fjsp_set1_v2r8(rswitch_scalar);
+    /* Setup switch parameters */
+    d_scalar         = rcutoff_scalar-rswitch_scalar;
+    d                = gmx_fjsp_set1_v2r8(d_scalar);
+    swV3             = gmx_fjsp_set1_v2r8(-10.0/(d_scalar*d_scalar*d_scalar));
+    swV4             = gmx_fjsp_set1_v2r8( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+    swV5             = gmx_fjsp_set1_v2r8( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+    swF2             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar));
+    swF3             = gmx_fjsp_set1_v2r8( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+    swF4             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx01             = _fjsp_sub_v2r8(ix0,jx1);
+            dy01             = _fjsp_sub_v2r8(iy0,jy1);
+            dz01             = _fjsp_sub_v2r8(iz0,jz1);
+            dx02             = _fjsp_sub_v2r8(ix0,jx2);
+            dy02             = _fjsp_sub_v2r8(iy0,jy2);
+            dz02             = _fjsp_sub_v2r8(iz0,jz2);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+            rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+            rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+            rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(rinv00,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+
+            d                = _fjsp_sub_v2r8(r00,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(velec,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq01,rcutoff2))
+            {
+
+            r01              = _fjsp_mul_v2r8(rsq01,rinv01);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r01,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq01,_fjsp_sub_v2r8(rinv01,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq01,rinv01),_fjsp_sub_v2r8(rinvsq01,felec));
+
+            d                = _fjsp_sub_v2r8(r01,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv01,_fjsp_mul_v2r8(velec,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq01,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+            
+            fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq02,rcutoff2))
+            {
+
+            r02              = _fjsp_mul_v2r8(rsq02,rinv02);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r02,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq02,_fjsp_sub_v2r8(rinv02,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq02,rinv02),_fjsp_sub_v2r8(rinvsq02,felec));
+
+            d                = _fjsp_sub_v2r8(r02,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv02,_fjsp_mul_v2r8(velec,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq02,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+            
+            fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+            {
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(rinv10,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+
+            d                = _fjsp_sub_v2r8(r10,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv10,_fjsp_mul_v2r8(velec,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
+            {
+
+            r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(rinv11,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
+
+            d                = _fjsp_sub_v2r8(r11,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv11,_fjsp_mul_v2r8(velec,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
+            {
+
+            r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(rinv12,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
+
+            d                = _fjsp_sub_v2r8(r12,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv12,_fjsp_mul_v2r8(velec,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+            {
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(rinv20,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+
+            d                = _fjsp_sub_v2r8(r20,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv20,_fjsp_mul_v2r8(velec,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
+            {
+
+            r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(rinv21,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
+
+            d                = _fjsp_sub_v2r8(r21,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv21,_fjsp_mul_v2r8(velec,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
+            {
+
+            r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(rinv22,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
+
+            d                = _fjsp_sub_v2r8(r22,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv22,_fjsp_mul_v2r8(velec,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            }
+
+            gmx_fjsp_decrement_3rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 585 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx01             = _fjsp_sub_v2r8(ix0,jx1);
+            dy01             = _fjsp_sub_v2r8(iy0,jy1);
+            dz01             = _fjsp_sub_v2r8(iz0,jz1);
+            dx02             = _fjsp_sub_v2r8(ix0,jx2);
+            dy02             = _fjsp_sub_v2r8(iy0,jy2);
+            dz02             = _fjsp_sub_v2r8(iz0,jz2);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+            rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+            rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+            rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(rinv00,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+
+            d                = _fjsp_sub_v2r8(r00,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(velec,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq01,rcutoff2))
+            {
+
+            r01              = _fjsp_mul_v2r8(rsq01,rinv01);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r01,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq01,_fjsp_sub_v2r8(rinv01,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq01,rinv01),_fjsp_sub_v2r8(rinvsq01,felec));
+
+            d                = _fjsp_sub_v2r8(r01,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv01,_fjsp_mul_v2r8(velec,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq01,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+            
+            fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq02,rcutoff2))
+            {
+
+            r02              = _fjsp_mul_v2r8(rsq02,rinv02);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r02,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq02,_fjsp_sub_v2r8(rinv02,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq02,rinv02),_fjsp_sub_v2r8(rinvsq02,felec));
+
+            d                = _fjsp_sub_v2r8(r02,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv02,_fjsp_mul_v2r8(velec,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq02,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+            
+            fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+            {
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(rinv10,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+
+            d                = _fjsp_sub_v2r8(r10,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv10,_fjsp_mul_v2r8(velec,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
+            {
+
+            r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(rinv11,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
+
+            d                = _fjsp_sub_v2r8(r11,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv11,_fjsp_mul_v2r8(velec,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
+            {
+
+            r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(rinv12,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
+
+            d                = _fjsp_sub_v2r8(r12,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv12,_fjsp_mul_v2r8(velec,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+            {
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(rinv20,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+
+            d                = _fjsp_sub_v2r8(r20,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv20,_fjsp_mul_v2r8(velec,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
+            {
+
+            r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(rinv21,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
+
+            d                = _fjsp_sub_v2r8(r21,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv21,_fjsp_mul_v2r8(velec,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
+            {
+
+            r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(rinv22,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
+
+            d                = _fjsp_sub_v2r8(r22,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv22,_fjsp_mul_v2r8(velec,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            }
+
+            gmx_fjsp_decrement_3rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 585 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 18 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_F,outeriter*18 + inneriter*585);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEwSw_VdwNone_GeomW4P1_sparc64_hpc_ace_double.c b/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEwSw_VdwNone_GeomW4P1_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..ff1300a
--- /dev/null
@@ -0,0 +1,1288 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSw_VdwNone_GeomW4P1_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: Ewald, switched between rcoulomb_switch and rcoulomb
+ * VdW interaction:            None
+ * Geometry:                   Water4-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecEwSw_VdwNone_GeomW4P1_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    _fjsp_v2r8       rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
+    real             rswitch_scalar,d_scalar;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+
+    sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+    ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+    rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+
+    rswitch_scalar   = fr->rcoulomb_switch;
+    rswitch          = gmx_fjsp_set1_v2r8(rswitch_scalar);
+    /* Setup switch parameters */
+    d_scalar         = rcutoff_scalar-rswitch_scalar;
+    d                = gmx_fjsp_set1_v2r8(d_scalar);
+    swV3             = gmx_fjsp_set1_v2r8(-10.0/(d_scalar*d_scalar*d_scalar));
+    swV4             = gmx_fjsp_set1_v2r8( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+    swV5             = gmx_fjsp_set1_v2r8( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+    swF2             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar));
+    swF3             = gmx_fjsp_set1_v2r8( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+    swF4             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
+                                                 &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+        fix3             = _fjsp_setzero_v2r8();
+        fiy3             = _fjsp_setzero_v2r8();
+        fiz3             = _fjsp_setzero_v2r8();
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx30             = _fjsp_sub_v2r8(ix3,jx0);
+            dy30             = _fjsp_sub_v2r8(iy3,jy0);
+            dz30             = _fjsp_sub_v2r8(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+            {
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(rinv10,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+
+            d                = _fjsp_sub_v2r8(r10,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv10,_fjsp_mul_v2r8(velec,dsw)) );
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+            {
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(rinv20,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+
+            d                = _fjsp_sub_v2r8(r20,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv20,_fjsp_mul_v2r8(velec,dsw)) );
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq30,rcutoff2))
+            {
+
+            r30              = _fjsp_mul_v2r8(rsq30,rinv30);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _fjsp_mul_v2r8(iq3,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r30,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq30,_fjsp_sub_v2r8(rinv30,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq30,rinv30),_fjsp_sub_v2r8(rinvsq30,felec));
+
+            d                = _fjsp_sub_v2r8(r30,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv30,_fjsp_mul_v2r8(velec,dsw)) );
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq30,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+            
+            fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+
+            }
+
+            gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 207 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx30             = _fjsp_sub_v2r8(ix3,jx0);
+            dy30             = _fjsp_sub_v2r8(iy3,jy0);
+            dz30             = _fjsp_sub_v2r8(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+            {
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(rinv10,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+
+            d                = _fjsp_sub_v2r8(r10,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv10,_fjsp_mul_v2r8(velec,dsw)) );
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+            {
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(rinv20,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+
+            d                = _fjsp_sub_v2r8(r20,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv20,_fjsp_mul_v2r8(velec,dsw)) );
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq30,rcutoff2))
+            {
+
+            r30              = _fjsp_mul_v2r8(rsq30,rinv30);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _fjsp_mul_v2r8(iq3,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r30,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq30,_fjsp_sub_v2r8(rinv30,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq30,rinv30),_fjsp_sub_v2r8(rinvsq30,felec));
+
+            d                = _fjsp_sub_v2r8(r30,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv30,_fjsp_mul_v2r8(velec,dsw)) );
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq30,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+            
+            fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+
+            }
+
+            gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 207 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset+DIM,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 19 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4_VF,outeriter*19 + inneriter*207);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSw_VdwNone_GeomW4P1_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: Ewald (tabulated), multiplied by a switch function
+ * VdW interaction:            None
+ * Geometry:                   Water4-Particle
+ * Calculate force/pot:        Force
+ *
+ * For every i entry of nlist, sites 1-3 of a 4-site water interact with each
+ * j particle of that entry; site 0 is never read or written by this kernel
+ * (coordinates and forces are addressed from i_coord_offset+DIM, charges from
+ * inr+1..inr+3). The i charges are taken once from nlist->iinr[0], so every
+ * i water in the list is treated as carrying the same charges.
+ *
+ * The Coulomb term is interpolated from fr->ic->tabq_coul_FDV0 and switched
+ * between fr->rcoulomb_switch and fr->rcoulomb; pairs with rsq >= rcoulomb^2
+ * are masked out. Only forces are produced - no energy is accumulated.
+ * j particles are processed two per iteration, followed by a one-particle
+ * epilogue when the list length is odd.
+ *
+ * nlist        neighbor list (nri, iinr, jindex, jjnr, shift are used)
+ * xx           particle coordinates (read)
+ * ff           particle forces (updated for i and j particles)
+ * fr           supplies shift_vec, fshift, epsfac, rcoulomb, rcoulomb_switch
+ *              and the Ewald table data in fr->ic
+ * mdatoms      supplies chargeA
+ * kernel_data  not referenced by this force-only kernel
+ * nrnb         flop accounting, updated through inc_nrnb()
+ */
+void
+nb_kernel_ElecEwSw_VdwNone_GeomW4P1_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. the two different
+     * jnr indices whose data is put in the two positions of the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    _fjsp_v2r8       rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
+    real             rswitch_scalar,d_scalar;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+
+    sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+    ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters: charges of sites 1-3 of the first i water, prescaled by facel */
+    inr              = nlist->iinr[0];
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+    rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+
+    rswitch_scalar   = fr->rcoulomb_switch;
+    rswitch          = gmx_fjsp_set1_v2r8(rswitch_scalar);
+    /* Setup switch parameters; d_scalar is the width of the switching region.
+     * The coefficients are used below as
+     *   sw (d) = 1 + swV3*d^3 + swV4*d^4 + swV5*d^5
+     *   dsw(d) =     swF2*d^2 + swF3*d^3 + swF4*d^4     with d = max(r-rswitch,0)
+     * (assuming _fjsp_madd_v2r8(a,b,c) evaluates a*b+c - confirm in the intrinsics reference).
+     */
+    d_scalar         = rcutoff_scalar-rswitch_scalar;
+    d                = gmx_fjsp_set1_v2r8(d_scalar);
+    swV3             = gmx_fjsp_set1_v2r8(-10.0/(d_scalar*d_scalar*d_scalar));
+    swV4             = gmx_fjsp_set1_v2r8( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+    swV5             = gmx_fjsp_set1_v2r8( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+    swF2             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar));
+    swF3             = gmx_fjsp_set1_v2r8( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+    swF4             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+
+    /* Initialize up front to avoid uninitialized-variable compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector (starts at site 1, hence the +DIM) */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
+                                                 &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+        fix3             = _fjsp_setzero_v2r8();
+        fiy3             = _fjsp_setzero_v2r8();
+        fiz3             = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop: two j particles (A and B) per iteration */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx30             = _fjsp_sub_v2r8(ix3,jx0);
+            dy30             = _fjsp_sub_v2r8(iy3,jy0);
+            dz30             = _fjsp_sub_v2r8(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+            {
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+            /* Fetch table data for lanes A and B; the table is addressed with 4 reals per point */
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(rinv10,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+
+            d                = _fjsp_sub_v2r8(r10,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function; velec is needed here even though no energy is stored */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv10,_fjsp_mul_v2r8(velec,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+            {
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(rinv20,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+
+            d                = _fjsp_sub_v2r8(r20,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv20,_fjsp_mul_v2r8(velec,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq30,rcutoff2))
+            {
+
+            r30              = _fjsp_mul_v2r8(rsq30,rinv30);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _fjsp_mul_v2r8(iq3,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r30,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq30,_fjsp_sub_v2r8(rinv30,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq30,rinv30),_fjsp_sub_v2r8(rinvsq30,felec));
+
+            d                = _fjsp_sub_v2r8(r30,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv30,_fjsp_mul_v2r8(velec,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq30,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+            
+            fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+
+            }
+            /* Apply the accumulated j forces to both j particles (helper presumably subtracts them from f) */
+            gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 198 flops */
+        }
+        /* Epilogue: one j particle is left over when the list length is odd; only lane A carries data */
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx30             = _fjsp_sub_v2r8(ix3,jx0);
+            dy30             = _fjsp_sub_v2r8(iy3,jy0);
+            dz30             = _fjsp_sub_v2r8(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+            {
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+            /* Only table index i[0] is used here; the second operand of each transpose is zero */
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(rinv10,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+
+            d                = _fjsp_sub_v2r8(r10,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv10,_fjsp_mul_v2r8(velec,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+            /* Presumably clears the unused second lane so it adds nothing to the forces - confirm */
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+            {
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(rinv20,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+
+            d                = _fjsp_sub_v2r8(r20,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv20,_fjsp_mul_v2r8(velec,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq30,rcutoff2))
+            {
+
+            r30              = _fjsp_mul_v2r8(rsq30,rinv30);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _fjsp_mul_v2r8(iq3,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r30,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq30,_fjsp_sub_v2r8(rinv30,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq30,rinv30),_fjsp_sub_v2r8(rinvsq30,felec));
+
+            d                = _fjsp_sub_v2r8(r30,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv30,_fjsp_mul_v2r8(velec,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq30,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+            
+            fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+
+            }
+
+            gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 198 flops */
+        }
+
+        /* End of innermost loop */
+        /* Hand the accumulated i forces for sites 1-3 to the helper, together with the f and fshift destinations */
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset+DIM,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations (counted per j particle, not per SIMD iteration) */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 18 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4_F,outeriter*18 + inneriter*198);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEwSw_VdwNone_GeomW4W4_sparc64_hpc_ace_double.c b/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEwSw_VdwNone_GeomW4W4_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..66de2dd
--- /dev/null
@@ -0,0 +1,2864 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSw_VdwNone_GeomW4W4_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            None
+ * Geometry:                   Water4-Water4
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecEwSw_VdwNone_GeomW4W4_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx1A,vdwjidx1B;
+    _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B;
+    _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    int              vdwjidx3A,vdwjidx3B;
+    _fjsp_v2r8       jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+    _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    _fjsp_v2r8       dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+    _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    _fjsp_v2r8       dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+    _fjsp_v2r8       dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+    _fjsp_v2r8       dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+    _fjsp_v2r8       dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    _fjsp_v2r8       rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
+    real             rswitch_scalar,d_scalar;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+
+    sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+    ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+
+    jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+    jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+    jq3              = gmx_fjsp_set1_v2r8(charge[inr+3]);
+    qq11             = _fjsp_mul_v2r8(iq1,jq1);
+    qq12             = _fjsp_mul_v2r8(iq1,jq2);
+    qq13             = _fjsp_mul_v2r8(iq1,jq3);
+    qq21             = _fjsp_mul_v2r8(iq2,jq1);
+    qq22             = _fjsp_mul_v2r8(iq2,jq2);
+    qq23             = _fjsp_mul_v2r8(iq2,jq3);
+    qq31             = _fjsp_mul_v2r8(iq3,jq1);
+    qq32             = _fjsp_mul_v2r8(iq3,jq2);
+    qq33             = _fjsp_mul_v2r8(iq3,jq3);
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+    rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+
+    rswitch_scalar   = fr->rcoulomb_switch;
+    rswitch          = gmx_fjsp_set1_v2r8(rswitch_scalar);
+    /* Setup switch parameters */
+    d_scalar         = rcutoff_scalar-rswitch_scalar;
+    d                = gmx_fjsp_set1_v2r8(d_scalar);
+    swV3             = gmx_fjsp_set1_v2r8(-10.0/(d_scalar*d_scalar*d_scalar));
+    swV4             = gmx_fjsp_set1_v2r8( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+    swV5             = gmx_fjsp_set1_v2r8( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+    swF2             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar));
+    swF3             = gmx_fjsp_set1_v2r8( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+    swF4             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
+                                                 &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+        fix3             = _fjsp_setzero_v2r8();
+        fiy3             = _fjsp_setzero_v2r8();
+        fiz3             = _fjsp_setzero_v2r8();
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
+                                              &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx13             = _fjsp_sub_v2r8(ix1,jx3);
+            dy13             = _fjsp_sub_v2r8(iy1,jy3);
+            dz13             = _fjsp_sub_v2r8(iz1,jz3);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+            dx23             = _fjsp_sub_v2r8(ix2,jx3);
+            dy23             = _fjsp_sub_v2r8(iy2,jy3);
+            dz23             = _fjsp_sub_v2r8(iz2,jz3);
+            dx31             = _fjsp_sub_v2r8(ix3,jx1);
+            dy31             = _fjsp_sub_v2r8(iy3,jy1);
+            dz31             = _fjsp_sub_v2r8(iz3,jz1);
+            dx32             = _fjsp_sub_v2r8(ix3,jx2);
+            dy32             = _fjsp_sub_v2r8(iy3,jy2);
+            dz32             = _fjsp_sub_v2r8(iz3,jz2);
+            dx33             = _fjsp_sub_v2r8(ix3,jx3);
+            dy33             = _fjsp_sub_v2r8(iy3,jy3);
+            dz33             = _fjsp_sub_v2r8(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+            rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+            rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+            rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+            rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+            rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+            rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+            rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+            rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+            rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+            rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+            rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+            rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+            fjx3             = _fjsp_setzero_v2r8();
+            fjy3             = _fjsp_setzero_v2r8();
+            fjz3             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
+            {
+
+            r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(rinv11,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
+
+            d                = _fjsp_sub_v2r8(r11,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv11,_fjsp_mul_v2r8(velec,dsw)) );
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
+            {
+
+            r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(rinv12,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
+
+            d                = _fjsp_sub_v2r8(r12,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv12,_fjsp_mul_v2r8(velec,dsw)) );
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq13,rcutoff2))
+            {
+
+            r13              = _fjsp_mul_v2r8(rsq13,rinv13);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r13,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq13,_fjsp_sub_v2r8(rinv13,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq13,rinv13),_fjsp_sub_v2r8(rinvsq13,felec));
+
+            d                = _fjsp_sub_v2r8(r13,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv13,_fjsp_mul_v2r8(velec,dsw)) );
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq13,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+            
+            fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
+            {
+
+            r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(rinv21,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
+
+            d                = _fjsp_sub_v2r8(r21,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv21,_fjsp_mul_v2r8(velec,dsw)) );
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
+            {
+
+            r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(rinv22,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
+
+            d                = _fjsp_sub_v2r8(r22,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv22,_fjsp_mul_v2r8(velec,dsw)) );
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq23,rcutoff2))
+            {
+
+            r23              = _fjsp_mul_v2r8(rsq23,rinv23);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r23,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq23,_fjsp_sub_v2r8(rinv23,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq23,rinv23),_fjsp_sub_v2r8(rinvsq23,felec));
+
+            d                = _fjsp_sub_v2r8(r23,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv23,_fjsp_mul_v2r8(velec,dsw)) );
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq23,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+            
+            fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq31,rcutoff2))
+            {
+
+            r31              = _fjsp_mul_v2r8(rsq31,rinv31);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r31,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq31,_fjsp_sub_v2r8(rinv31,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq31,rinv31),_fjsp_sub_v2r8(rinvsq31,felec));
+
+            d                = _fjsp_sub_v2r8(r31,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv31,_fjsp_mul_v2r8(velec,dsw)) );
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq31,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+            
+            fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq32,rcutoff2))
+            {
+
+            r32              = _fjsp_mul_v2r8(rsq32,rinv32);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r32,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq32,_fjsp_sub_v2r8(rinv32,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq32,rinv32),_fjsp_sub_v2r8(rinvsq32,felec));
+
+            d                = _fjsp_sub_v2r8(r32,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv32,_fjsp_mul_v2r8(velec,dsw)) );
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq32,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+            
+            fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq33,rcutoff2))
+            {
+
+            r33              = _fjsp_mul_v2r8(rsq33,rinv33);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r33,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq33,_fjsp_sub_v2r8(rinv33,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq33,rinv33),_fjsp_sub_v2r8(rinvsq33,felec));
+
+            d                = _fjsp_sub_v2r8(r33,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv33,_fjsp_mul_v2r8(velec,dsw)) );
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq33,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+            
+            fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+
+            }
+
+            gmx_fjsp_decrement_3rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA+DIM,f+j_coord_offsetB+DIM,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 612 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA+DIM,
+                                              &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx13             = _fjsp_sub_v2r8(ix1,jx3);
+            dy13             = _fjsp_sub_v2r8(iy1,jy3);
+            dz13             = _fjsp_sub_v2r8(iz1,jz3);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+            dx23             = _fjsp_sub_v2r8(ix2,jx3);
+            dy23             = _fjsp_sub_v2r8(iy2,jy3);
+            dz23             = _fjsp_sub_v2r8(iz2,jz3);
+            dx31             = _fjsp_sub_v2r8(ix3,jx1);
+            dy31             = _fjsp_sub_v2r8(iy3,jy1);
+            dz31             = _fjsp_sub_v2r8(iz3,jz1);
+            dx32             = _fjsp_sub_v2r8(ix3,jx2);
+            dy32             = _fjsp_sub_v2r8(iy3,jy2);
+            dz32             = _fjsp_sub_v2r8(iz3,jz2);
+            dx33             = _fjsp_sub_v2r8(ix3,jx3);
+            dy33             = _fjsp_sub_v2r8(iy3,jy3);
+            dz33             = _fjsp_sub_v2r8(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+            rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+            rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+            rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+            rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+            rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+            rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+            rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+            rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+            rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+            rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+            rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+            rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+            fjx3             = _fjsp_setzero_v2r8();
+            fjy3             = _fjsp_setzero_v2r8();
+            fjz3             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
+            {
+
+            r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(rinv11,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
+
+            d                = _fjsp_sub_v2r8(r11,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv11,_fjsp_mul_v2r8(velec,dsw)) );
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
+            {
+
+            r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(rinv12,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
+
+            d                = _fjsp_sub_v2r8(r12,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv12,_fjsp_mul_v2r8(velec,dsw)) );
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq13,rcutoff2))
+            {
+
+            r13              = _fjsp_mul_v2r8(rsq13,rinv13);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r13,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq13,_fjsp_sub_v2r8(rinv13,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq13,rinv13),_fjsp_sub_v2r8(rinvsq13,felec));
+
+            d                = _fjsp_sub_v2r8(r13,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv13,_fjsp_mul_v2r8(velec,dsw)) );
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq13,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+            
+            fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
+            {
+
+            r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(rinv21,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
+
+            d                = _fjsp_sub_v2r8(r21,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv21,_fjsp_mul_v2r8(velec,dsw)) );
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
+            {
+
+            r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(rinv22,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
+
+            d                = _fjsp_sub_v2r8(r22,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv22,_fjsp_mul_v2r8(velec,dsw)) );
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq23,rcutoff2))
+            {
+
+            r23              = _fjsp_mul_v2r8(rsq23,rinv23);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r23,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq23,_fjsp_sub_v2r8(rinv23,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq23,rinv23),_fjsp_sub_v2r8(rinvsq23,felec));
+
+            d                = _fjsp_sub_v2r8(r23,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv23,_fjsp_mul_v2r8(velec,dsw)) );
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq23,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+            
+            fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq31,rcutoff2))
+            {
+
+            r31              = _fjsp_mul_v2r8(rsq31,rinv31);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r31,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq31,_fjsp_sub_v2r8(rinv31,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq31,rinv31),_fjsp_sub_v2r8(rinvsq31,felec));
+
+            d                = _fjsp_sub_v2r8(r31,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv31,_fjsp_mul_v2r8(velec,dsw)) );
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq31,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+            
+            fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq32,rcutoff2))
+            {
+
+            r32              = _fjsp_mul_v2r8(rsq32,rinv32);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r32,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq32,_fjsp_sub_v2r8(rinv32,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq32,rinv32),_fjsp_sub_v2r8(rinvsq32,felec));
+
+            d                = _fjsp_sub_v2r8(r32,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv32,_fjsp_mul_v2r8(velec,dsw)) );
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq32,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+            
+            fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq33,rcutoff2))
+            {
+
+            r33              = _fjsp_mul_v2r8(rsq33,rinv33);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r33,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq33,_fjsp_sub_v2r8(rinv33,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq33,rinv33),_fjsp_sub_v2r8(rinvsq33,felec));
+
+            d                = _fjsp_sub_v2r8(r33,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv33,_fjsp_mul_v2r8(velec,dsw)) );
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq33,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+            
+            fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+
+            }
+
+            gmx_fjsp_decrement_3rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA+DIM,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 612 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset+DIM,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 19 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_VF,outeriter*19 + inneriter*612);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEwSw_VdwNone_GeomW4W4_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: Ewald (with potential switch)
+ * VdW interaction:            None
+ * Geometry:                   Water4-Water4
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecEwSw_VdwNone_GeomW4W4_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx1A,vdwjidx1B;
+    _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B;
+    _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    int              vdwjidx3A,vdwjidx3B;
+    _fjsp_v2r8       jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+    _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    _fjsp_v2r8       dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+    _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    _fjsp_v2r8       dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+    _fjsp_v2r8       dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+    _fjsp_v2r8       dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+    _fjsp_v2r8       dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    _fjsp_v2r8       rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
+    real             rswitch_scalar,d_scalar;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+
+    sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+    ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+
+    jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+    jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+    jq3              = gmx_fjsp_set1_v2r8(charge[inr+3]);
+    qq11             = _fjsp_mul_v2r8(iq1,jq1);
+    qq12             = _fjsp_mul_v2r8(iq1,jq2);
+    qq13             = _fjsp_mul_v2r8(iq1,jq3);
+    qq21             = _fjsp_mul_v2r8(iq2,jq1);
+    qq22             = _fjsp_mul_v2r8(iq2,jq2);
+    qq23             = _fjsp_mul_v2r8(iq2,jq3);
+    qq31             = _fjsp_mul_v2r8(iq3,jq1);
+    qq32             = _fjsp_mul_v2r8(iq3,jq2);
+    qq33             = _fjsp_mul_v2r8(iq3,jq3);
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+    rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+
+    rswitch_scalar   = fr->rcoulomb_switch;
+    rswitch          = gmx_fjsp_set1_v2r8(rswitch_scalar);
+    /* Setup switch parameters */
+    d_scalar         = rcutoff_scalar-rswitch_scalar;
+    d                = gmx_fjsp_set1_v2r8(d_scalar);
+    swV3             = gmx_fjsp_set1_v2r8(-10.0/(d_scalar*d_scalar*d_scalar));
+    swV4             = gmx_fjsp_set1_v2r8( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+    swV5             = gmx_fjsp_set1_v2r8( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+    swF2             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar));
+    swF3             = gmx_fjsp_set1_v2r8( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+    swF4             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
+                                                 &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+        fix3             = _fjsp_setzero_v2r8();
+        fiy3             = _fjsp_setzero_v2r8();
+        fiz3             = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
+                                              &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx13             = _fjsp_sub_v2r8(ix1,jx3);
+            dy13             = _fjsp_sub_v2r8(iy1,jy3);
+            dz13             = _fjsp_sub_v2r8(iz1,jz3);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+            dx23             = _fjsp_sub_v2r8(ix2,jx3);
+            dy23             = _fjsp_sub_v2r8(iy2,jy3);
+            dz23             = _fjsp_sub_v2r8(iz2,jz3);
+            dx31             = _fjsp_sub_v2r8(ix3,jx1);
+            dy31             = _fjsp_sub_v2r8(iy3,jy1);
+            dz31             = _fjsp_sub_v2r8(iz3,jz1);
+            dx32             = _fjsp_sub_v2r8(ix3,jx2);
+            dy32             = _fjsp_sub_v2r8(iy3,jy2);
+            dz32             = _fjsp_sub_v2r8(iz3,jz2);
+            dx33             = _fjsp_sub_v2r8(ix3,jx3);
+            dy33             = _fjsp_sub_v2r8(iy3,jy3);
+            dz33             = _fjsp_sub_v2r8(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+            rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+            rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+            rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+            rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+            rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+            rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+            rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+            rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+            rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+            rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+            rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+            rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+            fjx3             = _fjsp_setzero_v2r8();
+            fjy3             = _fjsp_setzero_v2r8();
+            fjz3             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
+            {
+
+            r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(rinv11,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
+
+            d                = _fjsp_sub_v2r8(r11,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv11,_fjsp_mul_v2r8(velec,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
+            {
+
+            r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(rinv12,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
+
+            d                = _fjsp_sub_v2r8(r12,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv12,_fjsp_mul_v2r8(velec,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq13,rcutoff2))
+            {
+
+            r13              = _fjsp_mul_v2r8(rsq13,rinv13);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r13,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq13,_fjsp_sub_v2r8(rinv13,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq13,rinv13),_fjsp_sub_v2r8(rinvsq13,felec));
+
+            d                = _fjsp_sub_v2r8(r13,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv13,_fjsp_mul_v2r8(velec,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq13,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+            
+            fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
+            {
+
+            r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(rinv21,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
+
+            d                = _fjsp_sub_v2r8(r21,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv21,_fjsp_mul_v2r8(velec,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
+            {
+
+            r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(rinv22,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
+
+            d                = _fjsp_sub_v2r8(r22,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv22,_fjsp_mul_v2r8(velec,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq23,rcutoff2))
+            {
+
+            r23              = _fjsp_mul_v2r8(rsq23,rinv23);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r23,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq23,_fjsp_sub_v2r8(rinv23,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq23,rinv23),_fjsp_sub_v2r8(rinvsq23,felec));
+
+            d                = _fjsp_sub_v2r8(r23,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv23,_fjsp_mul_v2r8(velec,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq23,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+            
+            fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq31,rcutoff2))
+            {
+
+            r31              = _fjsp_mul_v2r8(rsq31,rinv31);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r31,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq31,_fjsp_sub_v2r8(rinv31,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq31,rinv31),_fjsp_sub_v2r8(rinvsq31,felec));
+
+            d                = _fjsp_sub_v2r8(r31,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv31,_fjsp_mul_v2r8(velec,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq31,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+            
+            fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq32,rcutoff2))
+            {
+
+            r32              = _fjsp_mul_v2r8(rsq32,rinv32);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r32,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq32,_fjsp_sub_v2r8(rinv32,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq32,rinv32),_fjsp_sub_v2r8(rinvsq32,felec));
+
+            d                = _fjsp_sub_v2r8(r32,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv32,_fjsp_mul_v2r8(velec,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq32,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+            
+            fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq33,rcutoff2))
+            {
+
+            r33              = _fjsp_mul_v2r8(rsq33,rinv33);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r33,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq33,_fjsp_sub_v2r8(rinv33,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq33,rinv33),_fjsp_sub_v2r8(rinvsq33,felec));
+
+            d                = _fjsp_sub_v2r8(r33,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv33,_fjsp_mul_v2r8(velec,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq33,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+            
+            fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+
+            }
+
+            gmx_fjsp_decrement_3rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA+DIM,f+j_coord_offsetB+DIM,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 585 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA+DIM,
+                                              &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx13             = _fjsp_sub_v2r8(ix1,jx3);
+            dy13             = _fjsp_sub_v2r8(iy1,jy3);
+            dz13             = _fjsp_sub_v2r8(iz1,jz3);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+            dx23             = _fjsp_sub_v2r8(ix2,jx3);
+            dy23             = _fjsp_sub_v2r8(iy2,jy3);
+            dz23             = _fjsp_sub_v2r8(iz2,jz3);
+            dx31             = _fjsp_sub_v2r8(ix3,jx1);
+            dy31             = _fjsp_sub_v2r8(iy3,jy1);
+            dz31             = _fjsp_sub_v2r8(iz3,jz1);
+            dx32             = _fjsp_sub_v2r8(ix3,jx2);
+            dy32             = _fjsp_sub_v2r8(iy3,jy2);
+            dz32             = _fjsp_sub_v2r8(iz3,jz2);
+            dx33             = _fjsp_sub_v2r8(ix3,jx3);
+            dy33             = _fjsp_sub_v2r8(iy3,jy3);
+            dz33             = _fjsp_sub_v2r8(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+            rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+            rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+            rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+            rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+            rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+            rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+            rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+            rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+            rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+            rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+            rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+            rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+            fjx3             = _fjsp_setzero_v2r8();
+            fjy3             = _fjsp_setzero_v2r8();
+            fjz3             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
+            {
+
+            r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(rinv11,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
+
+            d                = _fjsp_sub_v2r8(r11,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv11,_fjsp_mul_v2r8(velec,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
+            {
+
+            r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(rinv12,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
+
+            d                = _fjsp_sub_v2r8(r12,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv12,_fjsp_mul_v2r8(velec,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq13,rcutoff2))
+            {
+
+            r13              = _fjsp_mul_v2r8(rsq13,rinv13);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r13,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq13,_fjsp_sub_v2r8(rinv13,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq13,rinv13),_fjsp_sub_v2r8(rinvsq13,felec));
+
+            d                = _fjsp_sub_v2r8(r13,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv13,_fjsp_mul_v2r8(velec,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq13,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+            
+            fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
+            {
+
+            r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(rinv21,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
+
+            d                = _fjsp_sub_v2r8(r21,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv21,_fjsp_mul_v2r8(velec,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
+            {
+
+            r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(rinv22,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
+
+            d                = _fjsp_sub_v2r8(r22,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv22,_fjsp_mul_v2r8(velec,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq23,rcutoff2))
+            {
+
+            r23              = _fjsp_mul_v2r8(rsq23,rinv23);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r23,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq23,_fjsp_sub_v2r8(rinv23,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq23,rinv23),_fjsp_sub_v2r8(rinvsq23,felec));
+
+            d                = _fjsp_sub_v2r8(r23,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv23,_fjsp_mul_v2r8(velec,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq23,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+            
+            fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq31,rcutoff2))
+            {
+
+            r31              = _fjsp_mul_v2r8(rsq31,rinv31);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r31,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq31,_fjsp_sub_v2r8(rinv31,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq31,rinv31),_fjsp_sub_v2r8(rinvsq31,felec));
+
+            d                = _fjsp_sub_v2r8(r31,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv31,_fjsp_mul_v2r8(velec,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq31,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+            
+            fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq32,rcutoff2))
+            {
+
+            r32              = _fjsp_mul_v2r8(rsq32,rinv32);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r32,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq32,_fjsp_sub_v2r8(rinv32,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq32,rinv32),_fjsp_sub_v2r8(rinvsq32,felec));
+
+            d                = _fjsp_sub_v2r8(r32,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv32,_fjsp_mul_v2r8(velec,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq32,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+            
+            fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq33,rcutoff2))
+            {
+
+            r33              = _fjsp_mul_v2r8(rsq33,rinv33);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r33,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq33,_fjsp_sub_v2r8(rinv33,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq33,rinv33),_fjsp_sub_v2r8(rinvsq33,felec));
+
+            d                = _fjsp_sub_v2r8(r33,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv33,_fjsp_mul_v2r8(velec,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq33,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+            
+            fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+
+            }
+
+            gmx_fjsp_decrement_3rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA+DIM,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 585 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset+DIM,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 18 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_F,outeriter*18 + inneriter*585);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEw_VdwCSTab_GeomP1P1_sparc64_hpc_ace_double.c b/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEw_VdwCSTab_GeomP1P1_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..3f30f96
--- /dev/null
@@ -0,0 +1,740 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwCSTab_GeomP1P1_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            CubicSplineTable
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecEw_VdwCSTab_GeomP1P1_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+    real             *vftab;
+    _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    vftab            = kernel_data->table_vdw->data;
+    vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_vdw->scale);
+
+    sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+    ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_1rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+
+        /* Load parameters for i particles */
+        iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_load1_v2r8(charge+inr+0));
+        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+        vvdwsum          = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 8;
+            vfconv.i[1]     *= 8;
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(rinv00,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 2 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 4 );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 6 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            vvdw             = _fjsp_add_v2r8(vvdw12,vvdw6);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fscal,dx00,dy00,dz00);
+
+            /* Inner loop uses 78 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 8;
+            vfconv.i[1]     *= 8;
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(rinv00,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            vvdw             = _fjsp_add_v2r8(vvdw12,vvdw6);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+            vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fscal,dx00,dy00,dz00);
+
+            /* Inner loop uses 78 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_1atom_swizzle_v2r8(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 9 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_VF,outeriter*9 + inneriter*78);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwCSTab_GeomP1P1_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            CubicSplineTable
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        Force (forces only; no potential energies are accumulated here)
+ */
+void
+nb_kernel_ElecEw_VdwCSTab_GeomP1P1_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. the two different
+     * jnr indices whose data are put in the two positions of the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+    real             *vftab;
+    _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv; /* used to read stored table indices back element by element */
+
+    x                = xx[0]; /* coordinates as a flat array of reals, indexed with DIM*atom below */
+    f                = ff[0]; /* forces as a flat array of reals, indexed with DIM*atom below */
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    vftab            = kernel_data->table_vdw->data;
+    vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_vdw->scale);
+
+    sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald); /* set but not referenced again in this kernel */
+    ewtab            = fr->ic->tabq_coul_F; /* indexed directly with the truncated Ewald table index below */
+    ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+    ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale); /* set but not referenced again in this kernel */
+
+    /* Give the j indices and coordinate offsets defined values up front to avoid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Offset into the shift vector and shift force arrays for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_1rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+
+        /* Load parameters for i particles */
+        iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_load1_v2r8(charge+inr+0));
+        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+        /* Start inner kernel loop: two j entries (A and B) are handled per iteration */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 8; /* scale index by 8; loads below use offsets +0,+2 (dispersion) and +4,+6 (repulsion) */
+            vfconv.i[1]     *= 8;
+
+            /* EWALD ELECTROSTATICS (force only; velec is never computed in this kernel) */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF)); /* NOTE(review): presumably (1-eweps)*ewtabF + eweps*ewtabFn - confirm madd/nmsub semantics */
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 2 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 4 );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 6 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw); /* combined electrostatic + VdW scalar force */
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fscal,dx00,dy00,dz00);
+
+            /* Inner loop uses 65 flops */
+        }
+
+        if(jidx<j_index_end) /* one j entry is left over when the number of neighbors is odd */
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 8; /* scale index by 8; loads below use offsets +0,+2 (dispersion) and +4,+6 (repulsion) */
+            vfconv.i[1]     *= 8; /* element 1 is not used for any table load in this single-j branch */
+
+            /* EWALD ELECTROSTATICS (force only; velec is never computed in this kernel) */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF)); /* NOTE(review): presumably (1-eweps)*ewtabF + eweps*ewtabFn - confirm madd/nmsub semantics */
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw); /* combined electrostatic + VdW scalar force */
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8()); /* NOTE(review): presumably zeroes the unused second SIMD element - confirm unpacklo semantics */
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fscal,dx00,dy00,dz00);
+
+            /* Inner loop uses 65 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_1atom_swizzle_v2r8(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 7 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops: 7 per outer iteration and 65 per inner iteration */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_F,outeriter*7 + inneriter*65);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEw_VdwCSTab_GeomW3P1_sparc64_hpc_ace_double.c b/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEw_VdwCSTab_GeomW3P1_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..2ebbb0d
--- /dev/null
@@ -0,0 +1,1160 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwCSTab_GeomW3P1_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            CubicSplineTable
+ * Geometry:                   Water3-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecEw_VdwCSTab_GeomW3P1_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. the two different
+     * jnr indices whose data are put in the two positions of the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+    real             *vftab;
+    _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    vftab            = kernel_data->table_vdw->data;
+    vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_vdw->scale);
+
+    sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+    ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+        vvdwsum          = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 8;
+            vfconv.i[1]     *= 8;
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(rinv00,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 2 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 4 );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 6 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            vvdw             = _fjsp_add_v2r8(vvdw12,vvdw6);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(rinv10,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(rinv20,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 169 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 8;
+            vfconv.i[1]     *= 8;
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(rinv00,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            vvdw             = _fjsp_add_v2r8(vvdw12,vvdw6);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+            vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(rinv10,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(rinv20,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 169 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 20 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_VF,outeriter*20 + inneriter*169);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwCSTab_GeomW3P1_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            CubicSplineTable
+ * Geometry:                   Water3-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecEw_VdwCSTab_GeomW3P1_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+    real             *vftab;
+    _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    vftab            = kernel_data->table_vdw->data;
+    vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_vdw->scale);
+
+    sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_F;
+    ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+    ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 8;
+            vfconv.i[1]     *= 8;
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 2 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 4 );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 6 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 146 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 8;
+            vfconv.i[1]     *= 8;
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 146 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 18 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_F,outeriter*18 + inneriter*146);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEw_VdwCSTab_GeomW3W3_sparc64_hpc_ace_double.c b/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEw_VdwCSTab_GeomW3W3_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..6ab689a
--- /dev/null
@@ -0,0 +1,2172 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwCSTab_GeomW3W3_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            CubicSplineTable
+ * Geometry:                   Water3-Water3
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecEw_VdwCSTab_GeomW3W3_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B;
+    _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B;
+    _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+    _fjsp_v2r8       dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+    real             *vftab;
+    _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    vftab            = kernel_data->table_vdw->data;
+    vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_vdw->scale);
+
+    sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+    ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq0              = gmx_fjsp_set1_v2r8(charge[inr+0]);
+    jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+    jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    qq00             = _fjsp_mul_v2r8(iq0,jq0);
+    c6_00            = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    qq01             = _fjsp_mul_v2r8(iq0,jq1);
+    qq02             = _fjsp_mul_v2r8(iq0,jq2);
+    qq10             = _fjsp_mul_v2r8(iq1,jq0);
+    qq11             = _fjsp_mul_v2r8(iq1,jq1);
+    qq12             = _fjsp_mul_v2r8(iq1,jq2);
+    qq20             = _fjsp_mul_v2r8(iq2,jq0);
+    qq21             = _fjsp_mul_v2r8(iq2,jq1);
+    qq22             = _fjsp_mul_v2r8(iq2,jq2);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+        vvdwsum          = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx01             = _fjsp_sub_v2r8(ix0,jx1);
+            dy01             = _fjsp_sub_v2r8(iy0,jy1);
+            dz01             = _fjsp_sub_v2r8(iz0,jz1);
+            dx02             = _fjsp_sub_v2r8(ix0,jx2);
+            dy02             = _fjsp_sub_v2r8(iy0,jy2);
+            dz02             = _fjsp_sub_v2r8(iz0,jz2);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+            rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+            rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+            rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 8;
+            vfconv.i[1]     *= 8;
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(rinv00,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 2 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 4 );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 6 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            vvdw             = _fjsp_add_v2r8(vvdw12,vvdw6);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r01              = _fjsp_mul_v2r8(rsq01,rinv01);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r01,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq01,_fjsp_sub_v2r8(rinv01,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq01,rinv01),_fjsp_sub_v2r8(rinvsq01,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+            
+            fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r02              = _fjsp_mul_v2r8(rsq02,rinv02);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r02,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq02,_fjsp_sub_v2r8(rinv02,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq02,rinv02),_fjsp_sub_v2r8(rinvsq02,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+            
+            fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(rinv10,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(rinv11,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(rinv12,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(rinv20,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(rinv21,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(rinv22,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            gmx_fjsp_decrement_3rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 430 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx01             = _fjsp_sub_v2r8(ix0,jx1);
+            dy01             = _fjsp_sub_v2r8(iy0,jy1);
+            dz01             = _fjsp_sub_v2r8(iz0,jz1);
+            dx02             = _fjsp_sub_v2r8(ix0,jx2);
+            dy02             = _fjsp_sub_v2r8(iy0,jy2);
+            dz02             = _fjsp_sub_v2r8(iz0,jz2);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+            rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+            rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+            rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 8;
+            vfconv.i[1]     *= 8;
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(rinv00,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            vvdw             = _fjsp_add_v2r8(vvdw12,vvdw6);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+            vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r01              = _fjsp_mul_v2r8(rsq01,rinv01);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r01,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq01,_fjsp_sub_v2r8(rinv01,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq01,rinv01),_fjsp_sub_v2r8(rinvsq01,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+            
+            fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r02              = _fjsp_mul_v2r8(rsq02,rinv02);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r02,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq02,_fjsp_sub_v2r8(rinv02,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq02,rinv02),_fjsp_sub_v2r8(rinvsq02,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+            
+            fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(rinv10,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(rinv11,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(rinv12,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(rinv20,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(rinv21,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(rinv22,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            gmx_fjsp_decrement_3rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 430 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 20 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*430);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwCSTab_GeomW3W3_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            CubicSplineTable
+ * Geometry:                   Water3-Water3
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecEw_VdwCSTab_GeomW3W3_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B;
+    _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B;
+    _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+    _fjsp_v2r8       dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+    real             *vftab;
+    _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    vftab            = kernel_data->table_vdw->data;
+    vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_vdw->scale);
+
+    sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_F;
+    ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+    ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq0              = gmx_fjsp_set1_v2r8(charge[inr+0]);
+    jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+    jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    qq00             = _fjsp_mul_v2r8(iq0,jq0);
+    c6_00            = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    qq01             = _fjsp_mul_v2r8(iq0,jq1);
+    qq02             = _fjsp_mul_v2r8(iq0,jq2);
+    qq10             = _fjsp_mul_v2r8(iq1,jq0);
+    qq11             = _fjsp_mul_v2r8(iq1,jq1);
+    qq12             = _fjsp_mul_v2r8(iq1,jq2);
+    qq20             = _fjsp_mul_v2r8(iq2,jq0);
+    qq21             = _fjsp_mul_v2r8(iq2,jq1);
+    qq22             = _fjsp_mul_v2r8(iq2,jq2);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx01             = _fjsp_sub_v2r8(ix0,jx1);
+            dy01             = _fjsp_sub_v2r8(iy0,jy1);
+            dz01             = _fjsp_sub_v2r8(iz0,jz1);
+            dx02             = _fjsp_sub_v2r8(ix0,jx2);
+            dy02             = _fjsp_sub_v2r8(iy0,jy2);
+            dz02             = _fjsp_sub_v2r8(iz0,jz2);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+            rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+            rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+            rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 8;
+            vfconv.i[1]     *= 8;
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 2 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 4 );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 6 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r01              = _fjsp_mul_v2r8(rsq01,rinv01);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r01,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq01,rinv01),_fjsp_sub_v2r8(rinvsq01,felec));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+            
+            fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r02              = _fjsp_mul_v2r8(rsq02,rinv02);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r02,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq02,rinv02),_fjsp_sub_v2r8(rinvsq02,felec));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+            
+            fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            gmx_fjsp_decrement_3rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 377 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx01             = _fjsp_sub_v2r8(ix0,jx1);
+            dy01             = _fjsp_sub_v2r8(iy0,jy1);
+            dz01             = _fjsp_sub_v2r8(iz0,jz1);
+            dx02             = _fjsp_sub_v2r8(ix0,jx2);
+            dy02             = _fjsp_sub_v2r8(iy0,jy2);
+            dz02             = _fjsp_sub_v2r8(iz0,jz2);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+            rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+            rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+            rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 8;
+            vfconv.i[1]     *= 8;
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r01              = _fjsp_mul_v2r8(rsq01,rinv01);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r01,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq01,rinv01),_fjsp_sub_v2r8(rinvsq01,felec));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+            
+            fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r02              = _fjsp_mul_v2r8(rsq02,rinv02);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r02,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq02,rinv02),_fjsp_sub_v2r8(rinvsq02,felec));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+            
+            fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            gmx_fjsp_decrement_3rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 377 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 18 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*377);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEw_VdwCSTab_GeomW4P1_sparc64_hpc_ace_double.c b/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEw_VdwCSTab_GeomW4P1_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..554c18d
--- /dev/null
@@ -0,0 +1,1276 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwCSTab_GeomW4P1_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            CubicSplineTable
+ * Geometry:                   Water4-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecEw_VdwCSTab_GeomW4P1_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+    real             *vftab;
+    _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    vftab            = kernel_data->table_vdw->data;
+    vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_vdw->scale);
+
+    sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+    ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_4rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+        fix3             = _fjsp_setzero_v2r8();
+        fiy3             = _fjsp_setzero_v2r8();
+        fiz3             = _fjsp_setzero_v2r8();
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+        vvdwsum          = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx30             = _fjsp_sub_v2r8(ix3,jx0);
+            dy30             = _fjsp_sub_v2r8(iy3,jy0);
+            dz30             = _fjsp_sub_v2r8(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 8;
+            vfconv.i[1]     *= 8;
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 2 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 4 );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 6 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            vvdw             = _fjsp_add_v2r8(vvdw12,vvdw6);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(rinv10,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(rinv20,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r30              = _fjsp_mul_v2r8(rsq30,rinv30);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _fjsp_mul_v2r8(iq3,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r30,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq30,_fjsp_sub_v2r8(rinv30,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq30,rinv30),_fjsp_sub_v2r8(rinvsq30,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+            
+            fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+
+            gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 194 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx30             = _fjsp_sub_v2r8(ix3,jx0);
+            dy30             = _fjsp_sub_v2r8(iy3,jy0);
+            dz30             = _fjsp_sub_v2r8(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 8;
+            vfconv.i[1]     *= 8;
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            vvdw             = _fjsp_add_v2r8(vvdw12,vvdw6);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(rinv10,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(rinv20,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r30              = _fjsp_mul_v2r8(rsq30,rinv30);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _fjsp_mul_v2r8(iq3,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r30,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq30,_fjsp_sub_v2r8(rinv30,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq30,rinv30),_fjsp_sub_v2r8(rinvsq30,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+            
+            fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+
+            gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 194 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_4atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 26 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_VF,outeriter*26 + inneriter*194);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwCSTab_GeomW4P1_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            CubicSplineTable
+ * Geometry:                   Water4-Particle
+ * Calculate force/pot:        Force
+ *
+ * Force-only kernel: for every i entry in the neighborlist, coordinates for
+ * four i atoms (suffixes 0-3) are loaded and the j atoms of that entry are
+ * looped over. i-atom 0 interacts with each j atom through the tabulated
+ * (cubic spline) VdW interaction only; i-atoms 1, 2 and 3 interact through
+ * tabulated Ewald electrostatics only. No potential energies are accumulated
+ * in this function.
+ *
+ * Parameters, as used by this function:
+ *   nlist       - neighborlist; nri, iinr, jindex, jjnr and shift are read
+ *                 (gid is fetched into a local but not used here).
+ *   xx          - atom coordinates (read).
+ *   ff          - atom forces; passed to the gmx_fjsp_decrement_* and
+ *                 gmx_fjsp_update_iforce_* helpers, which presumably apply
+ *                 the computed forces (helpers are defined elsewhere).
+ *   fr          - supplies shift_vec, fshift, epsfac, ntype, nbfp and the
+ *                 Ewald table data in fr->ic.
+ *   mdatoms     - supplies chargeA and typeA.
+ *   kernel_data - supplies the VdW table (table_vdw->data and ->scale).
+ *   nrnb        - flop accounting, updated once at the end via inc_nrnb().
+ */
+void
+nb_kernel_ElecEw_VdwCSTab_GeomW4P1_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+    real             *vftab;
+    _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    vftab            = kernel_data->table_vdw->data;
+    vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_vdw->scale);
+
+    sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_F;
+    ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+    ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters.
+     * Charges (multiplied by facel) and the VdW type offset are taken once from the
+     * first i entry and reused for every i entry in the list. Only i-atom 0 gets a VdW
+     * offset and only i-atoms 1-3 get charges, matching the interactions computed below.
+     */
+    inr              = nlist->iinr[0];
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* Initialize the j indices and offsets up front to avoid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_4rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+        fix3             = _fjsp_setzero_v2r8();
+        fiy3             = _fjsp_setzero_v2r8();
+        fiz3             = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop: two j atoms (A and B) are processed per iteration */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx30             = _fjsp_sub_v2r8(ix3,jx0);
+            dy30             = _fjsp_sub_v2r8(iy3,jy0);
+            dz30             = _fjsp_sub_v2r8(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer.
+             * The index is then multiplied by 8: the loads below read offsets +0/+2 for
+             * dispersion and +4/+6 for repulsion, so each table point presumably holds
+             * Y,F,G,H for dispersion followed by Y,F,G,H for repulsion (TODO confirm layout).
+             */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 8;
+            vfconv.i[1]     *= 8;
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 2 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 4 );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 6 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            fscal            = fvdw;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS
+             * The same sequence is repeated below for i-atoms 2 and 3.
+             * NOTE(review): assuming _fjsp_nmsub_v2r8(a,b,c) evaluates c - a*b, the first
+             * felec expression is the linear interpolation (1-eweps)*ewtabF + eweps*ewtabFn
+             * between two table values; confirm against the intrinsic definition.
+             */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r30              = _fjsp_mul_v2r8(rsq30,rinv30);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _fjsp_mul_v2r8(iq3,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r30,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq30,rinv30),_fjsp_sub_v2r8(rinvsq30,felec));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+            
+            fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+
+            gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 171 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates (remainder branch: a single leftover j atom, only index A is used) */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx30             = _fjsp_sub_v2r8(ix3,jx0);
+            dy30             = _fjsp_sub_v2r8(iy3,jy0);
+            dz30             = _fjsp_sub_v2r8(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 8;
+            vfconv.i[1]     *= 8;
+
+            /* CUBIC SPLINE TABLE DISPERSION (only table index i[0] is read; the partner register is zero) */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            fscal            = fvdw;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force (fscal was combined with a zero register above, presumably to clear the unused second element) */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r30              = _fjsp_mul_v2r8(rsq30,rinv30);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _fjsp_mul_v2r8(iq3,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r30,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq30,rinv30),_fjsp_sub_v2r8(rinvsq30,felec));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+            
+            fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+
+            gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 171 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_4atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 24 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_F,outeriter*24 + inneriter*171);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEw_VdwCSTab_GeomW4W4_sparc64_hpc_ace_double.c b/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEw_VdwCSTab_GeomW4W4_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..63d5a18
--- /dev/null
@@ -0,0 +1,2300 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwCSTab_GeomW4W4_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            CubicSplineTable
+ * Geometry:                   Water4-Water4
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecEw_VdwCSTab_GeomW4W4_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B;
+    _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B;
+    _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    int              vdwjidx3A,vdwjidx3B;
+    _fjsp_v2r8       jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    _fjsp_v2r8       dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+    _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    _fjsp_v2r8       dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+    _fjsp_v2r8       dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+    _fjsp_v2r8       dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+    _fjsp_v2r8       dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+    real             *vftab;
+    _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    vftab            = kernel_data->table_vdw->data;
+    vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_vdw->scale);
+
+    sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+    ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+    jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+    jq3              = gmx_fjsp_set1_v2r8(charge[inr+3]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    c6_00            = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    qq11             = _fjsp_mul_v2r8(iq1,jq1);
+    qq12             = _fjsp_mul_v2r8(iq1,jq2);
+    qq13             = _fjsp_mul_v2r8(iq1,jq3);
+    qq21             = _fjsp_mul_v2r8(iq2,jq1);
+    qq22             = _fjsp_mul_v2r8(iq2,jq2);
+    qq23             = _fjsp_mul_v2r8(iq2,jq3);
+    qq31             = _fjsp_mul_v2r8(iq3,jq1);
+    qq32             = _fjsp_mul_v2r8(iq3,jq2);
+    qq33             = _fjsp_mul_v2r8(iq3,jq3);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_4rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+        fix3             = _fjsp_setzero_v2r8();
+        fiy3             = _fjsp_setzero_v2r8();
+        fiz3             = _fjsp_setzero_v2r8();
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+        vvdwsum          = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_4rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx13             = _fjsp_sub_v2r8(ix1,jx3);
+            dy13             = _fjsp_sub_v2r8(iy1,jy3);
+            dz13             = _fjsp_sub_v2r8(iz1,jz3);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+            dx23             = _fjsp_sub_v2r8(ix2,jx3);
+            dy23             = _fjsp_sub_v2r8(iy2,jy3);
+            dz23             = _fjsp_sub_v2r8(iz2,jz3);
+            dx31             = _fjsp_sub_v2r8(ix3,jx1);
+            dy31             = _fjsp_sub_v2r8(iy3,jy1);
+            dz31             = _fjsp_sub_v2r8(iz3,jz1);
+            dx32             = _fjsp_sub_v2r8(ix3,jx2);
+            dy32             = _fjsp_sub_v2r8(iy3,jy2);
+            dz32             = _fjsp_sub_v2r8(iz3,jz2);
+            dx33             = _fjsp_sub_v2r8(ix3,jx3);
+            dy33             = _fjsp_sub_v2r8(iy3,jy3);
+            dz33             = _fjsp_sub_v2r8(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+            rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+            rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+            rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+            rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+            rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+            rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+            rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+            rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+            rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+            rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+            rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+            rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+            fjx3             = _fjsp_setzero_v2r8();
+            fjy3             = _fjsp_setzero_v2r8();
+            fjz3             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 8;
+            vfconv.i[1]     *= 8;
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 2 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 4 );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 6 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            vvdw             = _fjsp_add_v2r8(vvdw12,vvdw6);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(rinv11,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(rinv12,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r13              = _fjsp_mul_v2r8(rsq13,rinv13);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r13,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq13,_fjsp_sub_v2r8(rinv13,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq13,rinv13),_fjsp_sub_v2r8(rinvsq13,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+            
+            fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(rinv21,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(rinv22,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r23              = _fjsp_mul_v2r8(rsq23,rinv23);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r23,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq23,_fjsp_sub_v2r8(rinv23,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq23,rinv23),_fjsp_sub_v2r8(rinvsq23,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+            
+            fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r31              = _fjsp_mul_v2r8(rsq31,rinv31);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r31,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq31,_fjsp_sub_v2r8(rinv31,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq31,rinv31),_fjsp_sub_v2r8(rinvsq31,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+            
+            fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r32              = _fjsp_mul_v2r8(rsq32,rinv32);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r32,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq32,_fjsp_sub_v2r8(rinv32,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq32,rinv32),_fjsp_sub_v2r8(rinvsq32,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+            
+            fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r33              = _fjsp_mul_v2r8(rsq33,rinv33);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r33,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq33,_fjsp_sub_v2r8(rinv33,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq33,rinv33),_fjsp_sub_v2r8(rinvsq33,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+            
+            fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+
+            gmx_fjsp_decrement_4rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 458 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_4rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx13             = _fjsp_sub_v2r8(ix1,jx3);
+            dy13             = _fjsp_sub_v2r8(iy1,jy3);
+            dz13             = _fjsp_sub_v2r8(iz1,jz3);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+            dx23             = _fjsp_sub_v2r8(ix2,jx3);
+            dy23             = _fjsp_sub_v2r8(iy2,jy3);
+            dz23             = _fjsp_sub_v2r8(iz2,jz3);
+            dx31             = _fjsp_sub_v2r8(ix3,jx1);
+            dy31             = _fjsp_sub_v2r8(iy3,jy1);
+            dz31             = _fjsp_sub_v2r8(iz3,jz1);
+            dx32             = _fjsp_sub_v2r8(ix3,jx2);
+            dy32             = _fjsp_sub_v2r8(iy3,jy2);
+            dz32             = _fjsp_sub_v2r8(iz3,jz2);
+            dx33             = _fjsp_sub_v2r8(ix3,jx3);
+            dy33             = _fjsp_sub_v2r8(iy3,jy3);
+            dz33             = _fjsp_sub_v2r8(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+            rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+            rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+            rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+            rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+            rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+            rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+            rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+            rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+            rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+            rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+            rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+            rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+            fjx3             = _fjsp_setzero_v2r8();
+            fjy3             = _fjsp_setzero_v2r8();
+            fjz3             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 8;
+            vfconv.i[1]     *= 8;
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            vvdw             = _fjsp_add_v2r8(vvdw12,vvdw6);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(rinv11,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(rinv12,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r13              = _fjsp_mul_v2r8(rsq13,rinv13);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r13,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq13,_fjsp_sub_v2r8(rinv13,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq13,rinv13),_fjsp_sub_v2r8(rinvsq13,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+            
+            fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(rinv21,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(rinv22,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r23              = _fjsp_mul_v2r8(rsq23,rinv23);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r23,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq23,_fjsp_sub_v2r8(rinv23,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq23,rinv23),_fjsp_sub_v2r8(rinvsq23,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+            
+            fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r31              = _fjsp_mul_v2r8(rsq31,rinv31);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r31,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq31,_fjsp_sub_v2r8(rinv31,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq31,rinv31),_fjsp_sub_v2r8(rinvsq31,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+            
+            fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r32              = _fjsp_mul_v2r8(rsq32,rinv32);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r32,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq32,_fjsp_sub_v2r8(rinv32,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq32,rinv32),_fjsp_sub_v2r8(rinvsq32,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+            
+            fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r33              = _fjsp_mul_v2r8(rsq33,rinv33);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r33,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq33,_fjsp_sub_v2r8(rinv33,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq33,rinv33),_fjsp_sub_v2r8(rinvsq33,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+            
+            fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+
+            gmx_fjsp_decrement_4rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 458 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_4atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 26 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*458);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwCSTab_GeomW4W4_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            CubicSplineTable
+ * Geometry:                   Water4-Water4
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecEw_VdwCSTab_GeomW4W4_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B;
+    _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B;
+    _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    int              vdwjidx3A,vdwjidx3B;
+    _fjsp_v2r8       jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    _fjsp_v2r8       dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+    _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    _fjsp_v2r8       dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+    _fjsp_v2r8       dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+    _fjsp_v2r8       dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+    _fjsp_v2r8       dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+    real             *vftab;
+    _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    vftab            = kernel_data->table_vdw->data;
+    vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_vdw->scale);
+
+    sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_F;
+    ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+    ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+    jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+    jq3              = gmx_fjsp_set1_v2r8(charge[inr+3]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    c6_00            = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    qq11             = _fjsp_mul_v2r8(iq1,jq1);
+    qq12             = _fjsp_mul_v2r8(iq1,jq2);
+    qq13             = _fjsp_mul_v2r8(iq1,jq3);
+    qq21             = _fjsp_mul_v2r8(iq2,jq1);
+    qq22             = _fjsp_mul_v2r8(iq2,jq2);
+    qq23             = _fjsp_mul_v2r8(iq2,jq3);
+    qq31             = _fjsp_mul_v2r8(iq3,jq1);
+    qq32             = _fjsp_mul_v2r8(iq3,jq2);
+    qq33             = _fjsp_mul_v2r8(iq3,jq3);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_4rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+        fix3             = _fjsp_setzero_v2r8();
+        fiy3             = _fjsp_setzero_v2r8();
+        fiz3             = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_4rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx13             = _fjsp_sub_v2r8(ix1,jx3);
+            dy13             = _fjsp_sub_v2r8(iy1,jy3);
+            dz13             = _fjsp_sub_v2r8(iz1,jz3);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+            dx23             = _fjsp_sub_v2r8(ix2,jx3);
+            dy23             = _fjsp_sub_v2r8(iy2,jy3);
+            dz23             = _fjsp_sub_v2r8(iz2,jz3);
+            dx31             = _fjsp_sub_v2r8(ix3,jx1);
+            dy31             = _fjsp_sub_v2r8(iy3,jy1);
+            dz31             = _fjsp_sub_v2r8(iz3,jz1);
+            dx32             = _fjsp_sub_v2r8(ix3,jx2);
+            dy32             = _fjsp_sub_v2r8(iy3,jy2);
+            dz32             = _fjsp_sub_v2r8(iz3,jz2);
+            dx33             = _fjsp_sub_v2r8(ix3,jx3);
+            dy33             = _fjsp_sub_v2r8(iy3,jy3);
+            dz33             = _fjsp_sub_v2r8(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+            rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+            rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+            rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+            rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+            rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+            rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+            rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+            rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+            rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+            rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+            rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+            rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+            fjx3             = _fjsp_setzero_v2r8();
+            fjy3             = _fjsp_setzero_v2r8();
+            fjz3             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 8;
+            vfconv.i[1]     *= 8;
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 2 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 4 );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 6 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            fscal            = fvdw;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r13              = _fjsp_mul_v2r8(rsq13,rinv13);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r13,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq13,rinv13),_fjsp_sub_v2r8(rinvsq13,felec));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+            
+            fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r23              = _fjsp_mul_v2r8(rsq23,rinv23);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r23,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq23,rinv23),_fjsp_sub_v2r8(rinvsq23,felec));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+            
+            fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r31              = _fjsp_mul_v2r8(rsq31,rinv31);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r31,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq31,rinv31),_fjsp_sub_v2r8(rinvsq31,felec));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+            
+            fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r32              = _fjsp_mul_v2r8(rsq32,rinv32);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r32,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq32,rinv32),_fjsp_sub_v2r8(rinvsq32,felec));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+            
+            fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r33              = _fjsp_mul_v2r8(rsq33,rinv33);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r33,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq33,rinv33),_fjsp_sub_v2r8(rinvsq33,felec));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+            
+            fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+
+            gmx_fjsp_decrement_4rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 405 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_4rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx13             = _fjsp_sub_v2r8(ix1,jx3);
+            dy13             = _fjsp_sub_v2r8(iy1,jy3);
+            dz13             = _fjsp_sub_v2r8(iz1,jz3);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+            dx23             = _fjsp_sub_v2r8(ix2,jx3);
+            dy23             = _fjsp_sub_v2r8(iy2,jy3);
+            dz23             = _fjsp_sub_v2r8(iz2,jz3);
+            dx31             = _fjsp_sub_v2r8(ix3,jx1);
+            dy31             = _fjsp_sub_v2r8(iy3,jy1);
+            dz31             = _fjsp_sub_v2r8(iz3,jz1);
+            dx32             = _fjsp_sub_v2r8(ix3,jx2);
+            dy32             = _fjsp_sub_v2r8(iy3,jy2);
+            dz32             = _fjsp_sub_v2r8(iz3,jz2);
+            dx33             = _fjsp_sub_v2r8(ix3,jx3);
+            dy33             = _fjsp_sub_v2r8(iy3,jy3);
+            dz33             = _fjsp_sub_v2r8(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+            rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+            rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+            rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+            rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+            rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+            rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+            rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+            rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+            rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+            rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+            rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+            rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+            fjx3             = _fjsp_setzero_v2r8();
+            fjy3             = _fjsp_setzero_v2r8();
+            fjz3             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 8;
+            vfconv.i[1]     *= 8;
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            fscal            = fvdw;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r13              = _fjsp_mul_v2r8(rsq13,rinv13);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r13,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq13,rinv13),_fjsp_sub_v2r8(rinvsq13,felec));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+            
+            fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r23              = _fjsp_mul_v2r8(rsq23,rinv23);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r23,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq23,rinv23),_fjsp_sub_v2r8(rinvsq23,felec));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+            
+            fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r31              = _fjsp_mul_v2r8(rsq31,rinv31);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r31,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq31,rinv31),_fjsp_sub_v2r8(rinvsq31,felec));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+            
+            fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r32              = _fjsp_mul_v2r8(rsq32,rinv32);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r32,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq32,rinv32),_fjsp_sub_v2r8(rinvsq32,felec));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+            
+            fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r33              = _fjsp_mul_v2r8(rsq33,rinv33);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r33,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq33,rinv33),_fjsp_sub_v2r8(rinvsq33,felec));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+            
+            fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+
+            gmx_fjsp_decrement_4rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 405 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_4atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 24 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*405);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEw_VdwLJ_GeomP1P1_sparc64_hpc_ace_double.c b/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEw_VdwLJ_GeomP1P1_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..eb3a7a1
--- /dev/null
@@ -0,0 +1,614 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwLJ_GeomP1P1_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LennardJones
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecEw_VdwLJ_GeomP1P1_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+    ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_1rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+
+        /* Load parameters for i particles */
+        iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_load1_v2r8(charge+inr+0));
+        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+        vvdwsum          = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(rinv00,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fscal,dx00,dy00,dz00);
+
+            /* Inner loop uses 56 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(rinv00,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+            vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fscal,dx00,dy00,dz00);
+
+            /* Inner loop uses 56 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_1atom_swizzle_v2r8(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 9 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_VF,outeriter*9 + inneriter*56);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwLJ_GeomP1P1_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LennardJones
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        Force
+ *
+ * For every i particle in the neighbor list this kernel walks its j
+ * neighbors two at a time (suffixes A and B), with a separate tail
+ * section for a single leftover j particle. Only forces are accumulated;
+ * no potential-energy sums are formed in this variant.
+ *
+ * Arguments as used by the code below:
+ *   nlist        neighbor list: nri, iinr, jindex, jjnr, shift (gid is fetched but not read)
+ *   xx           coordinates, accessed as a flat real array through xx[0]
+ *   ff           forces, accessed as a flat real array through ff[0] and handed to the force helpers
+ *   fr           supplies shift_vec, fshift, epsfac, ntype, nbfp and the Ewald table data in fr->ic
+ *   mdatoms      supplies chargeA and typeA
+ *   kernel_data  not referenced in this force-only kernel
+ *   nrnb         counter structure passed to inc_nrnb()
+ */
+void
+nb_kernel_ElecEw_VdwLJ_GeomP1P1_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     *
+     * Several locals declared below (for example tx, ty, tz, rcutoff, rcutoff2, jidxall, ggid,
+     * one, two, one_sixth, one_twelfth, vfconv and gbconv) are never read in this kernel.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv; /* ewconv: read a stored SIMD value back as two integers */
+
+    x                = xx[0]; /* flat real pointer into the coordinate array */
+    f                = ff[0]; /* flat real pointer into the force array */
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid; /* assigned but not read in this kernel */
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald); /* assigned but not read in this kernel */
+    ewtab            = fr->ic->tabq_coul_F;
+    ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+    ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale); /* assigned but not read in this kernel */
+
+    /* Give the j indices and offsets defined values up front so that
+     * compilers do not warn about possibly uninitialized use. */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_1rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
+        /* Per-i force accumulators start from zero for each list entry */
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+
+        /* Load parameters for i particles: charge scaled by facel, and the
+         * offset of this particle type's block in the vdwparam table */
+        iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_load1_v2r8(charge+inr+0));
+        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+        /* Start inner kernel loop: two j entries (A and B) per pass */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00); /* distance, assuming rinv00 holds 1/sqrt(rsq00) */
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer.
+             * eweps is what remains of ewrt after subtracting the converted index; the index
+             * pair is stored through ewconv so it can be used for pointer arithmetic. */
+            ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF)); /* NOTE(review): looks like interpolation between ewtabF and ewtabFn weighted by eweps - confirm intrinsic operand order */
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+
+            /* LENNARD-JONES DISPERSION/REPULSION
+             * NOTE(review): presumably (c12*rinvsix - c6)*rinvsix*rinvsq00, assuming
+             * _fjsp_msub_v2r8(a,b,c) computes a*b-c - confirm against the intrinsic reference. */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,c6_00),_fjsp_mul_v2r8(rinvsix,rinvsq00));
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw); /* combined electrostatic + LJ scalar */
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fscal,dx00,dy00,dz00);
+
+            /* Inner loop uses 46 flops */
+        }
+
+        if(jidx<j_index_end) /* exactly one j entry is left when the list length is odd */
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00); /* distance, assuming rinv00 holds 1/sqrt(rsq00) */
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer.
+             * Only ewconv.i[0] is used for the table lookup in this single-j section. */
+            ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF)); /* NOTE(review): same interpolation form as in the paired loop - confirm intrinsic operand order */
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+
+            /* LENNARD-JONES DISPERSION/REPULSION
+             * Same expression as in the paired loop above. */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,c6_00),_fjsp_mul_v2r8(rinvsix,rinvsq00));
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8()); /* NOTE(review): presumably keeps the valid lane and zeroes the unused one - confirm */
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fscal,dx00,dy00,dz00);
+
+            /* Inner loop uses 46 flops */
+        }
+
+        /* End of innermost loop */
+
+        /* Hand the accumulated i force to the helper together with the
+         * force and shift-force destinations for this list entry */
+        gmx_fjsp_update_iforce_1atom_swizzle_v2r8(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 7 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops, using the per-iteration counts quoted above
+     * (7 per outer iteration, 46 per inner iteration) */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_F,outeriter*7 + inneriter*46);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEw_VdwLJ_GeomW3P1_sparc64_hpc_ace_double.c b/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEw_VdwLJ_GeomW3P1_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..b82d07d
--- /dev/null
@@ -0,0 +1,1034 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwLJ_GeomW3P1_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LennardJones
+ * Geometry:                   Water3-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecEw_VdwLJ_GeomW3P1_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+    ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+        vvdwsum          = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(rinv00,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(rinv10,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(rinv20,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 147 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(rinv00,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+            vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(rinv10,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(rinv20,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 147 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 20 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_VF,outeriter*20 + inneriter*147);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwLJ_GeomW3P1_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LennardJones
+ * Geometry:                   Water3-Particle
+ * Calculate force/pot:        Force
+ *
+ * For each of the nlist->nri neighborlist entries, three i atoms (starting at
+ * iinr[iidx]) interact with every j atom listed in
+ * jjnr[jindex[iidx] .. jindex[iidx+1]-1]. All three i atoms get a tabulated
+ * Ewald electrostatics force; only i atom 0 additionally gets a Lennard-Jones
+ * force. j atoms are processed two at a time (suffixes A/B), followed by a
+ * single-atom remainder step when the number of j entries is odd.
+ * No potential energies are accumulated in this force-only variant, and many
+ * of the declared locals (cutoff, reaction-field, potential-sum variables,
+ * ggid, ...) are never used by it.
+ *
+ * nlist        neighborlist; nri, iinr, jindex, jjnr and shift are used
+ * xx           coordinates, read only
+ * ff           forces, updated for i and j atoms through the gmx_fjsp_* helpers
+ * fr           supplies shift_vec, fshift (updated), epsfac, ntype, nbfp and
+ *              the Ewald table data in fr->ic
+ * mdatoms      supplies chargeA and typeA
+ * kernel_data  not referenced by this kernel
+ * nrnb         flop accounting, updated through inc_nrnb()
+ */
+void
+nb_kernel_ElecEw_VdwLJ_GeomW3P1_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_F;
+    ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+    ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters.
+     * The i-atom charges (pre-multiplied by facel) and the VdW type offset are
+     * taken once from the first list entry, iinr[0], and reused for every
+     * outer iteration.
+     * NOTE(review): this reads iinr[0] before the nri loop is entered, so it
+     * presumably relies on the list being non-empty - confirm with callers.
+     */
+    inr              = nlist->iinr[0];
+    iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* Pre-initialize the j indices and offsets to avoid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop, two j atoms (A and B) per iteration */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+            /* NOTE(review): felec below looks like a linear interpolation,
+             * (1-eweps)*ewtabF + eweps*ewtabFn, assuming ewtabF/ewtabFn are consecutive table
+             * entries and nmsub(a,b,c) = c - a*b - confirm in kernelutil_sparc64_hpc_ace_double.h */
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+
+            /* LENNARD-JONES DISPERSION/REPULSION (only the i atom 0 - j pair has VdW in this kernel) */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,c6_00),_fjsp_mul_v2r8(rinvsix,rinvsq00));
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 127 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+            /* Remainder: one j atom (A) is left over when the number of j entries is odd */
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,c6_00),_fjsp_mul_v2r8(rinvsix,rinvsq00));
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+            /* presumably clears the upper SIMD element so the unused B slot adds no force - TODO confirm */
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 127 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 18 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_F,outeriter*18 + inneriter*127);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEw_VdwLJ_GeomW3W3_sparc64_hpc_ace_double.c b/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEw_VdwLJ_GeomW3W3_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..a4d4afe
--- /dev/null
@@ -0,0 +1,2046 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwLJ_GeomW3W3_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LennardJones
+ * Geometry:                   Water3-Water3
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecEw_VdwLJ_GeomW3W3_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B;
+    _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B;
+    _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+    _fjsp_v2r8       dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+    ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq0              = gmx_fjsp_set1_v2r8(charge[inr+0]);
+    jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+    jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    qq00             = _fjsp_mul_v2r8(iq0,jq0);
+    c6_00            = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    qq01             = _fjsp_mul_v2r8(iq0,jq1);
+    qq02             = _fjsp_mul_v2r8(iq0,jq2);
+    qq10             = _fjsp_mul_v2r8(iq1,jq0);
+    qq11             = _fjsp_mul_v2r8(iq1,jq1);
+    qq12             = _fjsp_mul_v2r8(iq1,jq2);
+    qq20             = _fjsp_mul_v2r8(iq2,jq0);
+    qq21             = _fjsp_mul_v2r8(iq2,jq1);
+    qq22             = _fjsp_mul_v2r8(iq2,jq2);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+        vvdwsum          = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx01             = _fjsp_sub_v2r8(ix0,jx1);
+            dy01             = _fjsp_sub_v2r8(iy0,jy1);
+            dz01             = _fjsp_sub_v2r8(iz0,jz1);
+            dx02             = _fjsp_sub_v2r8(ix0,jx2);
+            dy02             = _fjsp_sub_v2r8(iy0,jy2);
+            dz02             = _fjsp_sub_v2r8(iz0,jz2);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+            rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+            rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+            rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(rinv00,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r01              = _fjsp_mul_v2r8(rsq01,rinv01);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r01,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq01,_fjsp_sub_v2r8(rinv01,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq01,rinv01),_fjsp_sub_v2r8(rinvsq01,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+            
+            fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r02              = _fjsp_mul_v2r8(rsq02,rinv02);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r02,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq02,_fjsp_sub_v2r8(rinv02,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq02,rinv02),_fjsp_sub_v2r8(rinvsq02,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+            
+            fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(rinv10,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(rinv11,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(rinv12,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(rinv20,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(rinv21,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(rinv22,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            gmx_fjsp_decrement_3rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 408 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx01             = _fjsp_sub_v2r8(ix0,jx1);
+            dy01             = _fjsp_sub_v2r8(iy0,jy1);
+            dz01             = _fjsp_sub_v2r8(iz0,jz1);
+            dx02             = _fjsp_sub_v2r8(ix0,jx2);
+            dy02             = _fjsp_sub_v2r8(iy0,jy2);
+            dz02             = _fjsp_sub_v2r8(iz0,jz2);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+            rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+            rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+            rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(rinv00,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+            vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r01              = _fjsp_mul_v2r8(rsq01,rinv01);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r01,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq01,_fjsp_sub_v2r8(rinv01,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq01,rinv01),_fjsp_sub_v2r8(rinvsq01,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+            
+            fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r02              = _fjsp_mul_v2r8(rsq02,rinv02);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r02,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq02,_fjsp_sub_v2r8(rinv02,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq02,rinv02),_fjsp_sub_v2r8(rinvsq02,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+            
+            fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(rinv10,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(rinv11,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(rinv12,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(rinv20,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(rinv21,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(rinv22,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            gmx_fjsp_decrement_3rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 408 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 20 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*408);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwLJ_GeomW3W3_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LennardJones
+ * Geometry:                   Water3-Water3
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecEw_VdwLJ_GeomW3W3_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B;
+    _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B;
+    _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+    _fjsp_v2r8       dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_F;
+    ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+    ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq0              = gmx_fjsp_set1_v2r8(charge[inr+0]);
+    jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+    jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    qq00             = _fjsp_mul_v2r8(iq0,jq0);
+    c6_00            = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    qq01             = _fjsp_mul_v2r8(iq0,jq1);
+    qq02             = _fjsp_mul_v2r8(iq0,jq2);
+    qq10             = _fjsp_mul_v2r8(iq1,jq0);
+    qq11             = _fjsp_mul_v2r8(iq1,jq1);
+    qq12             = _fjsp_mul_v2r8(iq1,jq2);
+    qq20             = _fjsp_mul_v2r8(iq2,jq0);
+    qq21             = _fjsp_mul_v2r8(iq2,jq1);
+    qq22             = _fjsp_mul_v2r8(iq2,jq2);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx01             = _fjsp_sub_v2r8(ix0,jx1);
+            dy01             = _fjsp_sub_v2r8(iy0,jy1);
+            dz01             = _fjsp_sub_v2r8(iz0,jz1);
+            dx02             = _fjsp_sub_v2r8(ix0,jx2);
+            dy02             = _fjsp_sub_v2r8(iy0,jy2);
+            dz02             = _fjsp_sub_v2r8(iz0,jz2);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+            rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+            rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+            rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,c6_00),_fjsp_mul_v2r8(rinvsix,rinvsq00));
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r01              = _fjsp_mul_v2r8(rsq01,rinv01);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r01,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq01,rinv01),_fjsp_sub_v2r8(rinvsq01,felec));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+            
+            fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r02              = _fjsp_mul_v2r8(rsq02,rinv02);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r02,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq02,rinv02),_fjsp_sub_v2r8(rinvsq02,felec));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+            
+            fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            gmx_fjsp_decrement_3rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 358 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx01             = _fjsp_sub_v2r8(ix0,jx1);
+            dy01             = _fjsp_sub_v2r8(iy0,jy1);
+            dz01             = _fjsp_sub_v2r8(iz0,jz1);
+            dx02             = _fjsp_sub_v2r8(ix0,jx2);
+            dy02             = _fjsp_sub_v2r8(iy0,jy2);
+            dz02             = _fjsp_sub_v2r8(iz0,jz2);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+            rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+            rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+            rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,c6_00),_fjsp_mul_v2r8(rinvsix,rinvsq00));
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r01              = _fjsp_mul_v2r8(rsq01,rinv01);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r01,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq01,rinv01),_fjsp_sub_v2r8(rinvsq01,felec));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+            
+            fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r02              = _fjsp_mul_v2r8(rsq02,rinv02);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r02,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq02,rinv02),_fjsp_sub_v2r8(rinvsq02,felec));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+            
+            fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            gmx_fjsp_decrement_3rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 358 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 18 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*358);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEw_VdwLJ_GeomW4P1_sparc64_hpc_ace_double.c b/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEw_VdwLJ_GeomW4P1_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..c21ef40
--- /dev/null
@@ -0,0 +1,1142 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwLJ_GeomW4P1_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LennardJones
+ * Geometry:                   Water4-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecEw_VdwLJ_GeomW4P1_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+    ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_4rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+        fix3             = _fjsp_setzero_v2r8();
+        fiy3             = _fjsp_setzero_v2r8();
+        fiz3             = _fjsp_setzero_v2r8();
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+        vvdwsum          = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx30             = _fjsp_sub_v2r8(ix3,jx0);
+            dy30             = _fjsp_sub_v2r8(iy3,jy0);
+            dz30             = _fjsp_sub_v2r8(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+
+            rinvsq00         = gmx_fjsp_inv_v2r8(rsq00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(rinv10,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(rinv20,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r30              = _fjsp_mul_v2r8(rsq30,rinv30);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _fjsp_mul_v2r8(iq3,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r30,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq30,_fjsp_sub_v2r8(rinv30,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq30,rinv30),_fjsp_sub_v2r8(rinvsq30,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+            
+            fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+
+            gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 170 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx30             = _fjsp_sub_v2r8(ix3,jx0);
+            dy30             = _fjsp_sub_v2r8(iy3,jy0);
+            dz30             = _fjsp_sub_v2r8(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+
+            rinvsq00         = gmx_fjsp_inv_v2r8(rsq00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(rinv10,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(rinv20,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r30              = _fjsp_mul_v2r8(rsq30,rinv30);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _fjsp_mul_v2r8(iq3,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r30,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq30,_fjsp_sub_v2r8(rinv30,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq30,rinv30),_fjsp_sub_v2r8(rinvsq30,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+            
+            fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+
+            gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 170 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_4atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 26 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_VF,outeriter*26 + inneriter*170);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwLJ_GeomW4P1_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LennardJones
+ * Geometry:                   Water4-Particle
+ * Calculate force/pot:        Force
+ *
+ * Force-only kernel. For every i entry of the neighbor list (four i atoms,
+ * suffixes 0-3) it walks the listed j atoms and evaluates
+ *   - Lennard-Jones between i atom 0 and the j atom, and
+ *   - tabulated Ewald electrostatics between i atoms 1, 2, 3 and the j atom.
+ * The resulting force contributions are handed to the
+ * gmx_fjsp_decrement_1rvec_*_swizzle_v2r8() helpers (j atoms) and to
+ * gmx_fjsp_update_iforce_4atom_swizzle_v2r8() (i atoms and shift force).
+ * No potential energies are accumulated in this variant.
+ *
+ * Parameters:
+ *   nlist       - neighbor list; nri, iinr, jindex, jjnr, shift and gid are read
+ *   xx          - coordinates, accessed as a flat real array through xx[0]
+ *   ff          - forces, accessed as a flat real array through ff[0]
+ *   fr          - force record; shift_vec, fshift, epsfac, ntype, nbfp and the
+ *                 Ewald data in fr->ic (sh_ewald, tabq_coul_F, tabq_scale) are read
+ *   mdatoms     - supplies the chargeA and typeA per-atom arrays
+ *   kernel_data - not referenced in this force-only kernel
+ *   nrnb        - flop accounting, updated once at the end through inc_nrnb()
+ */
+void
+nb_kernel_ElecEw_VdwLJ_GeomW4P1_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     *
+     * Note: a number of the locals declared below (for example vfconv, gbconv,
+     * dummy_mask and cutoff_mask) are declared but never referenced in this kernel.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_F;
+    ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+    ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters.
+     * The charges of i atoms 1-3 (pre-multiplied by facel) and the VdW parameter
+     * offset of i atom 0 are taken from the first list entry, iinr[0], and are
+     * not recomputed inside the outer loop, i.e. they are reused for every i entry.
+     * No charge is set up for i atom 0 and no VdW offset for i atoms 1-3.
+     */
+    inr              = nlist->iinr[0];
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* Give the j indices and offsets defined values up front to avoid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Offset of the shift vector for this list (DIM reals per shift index) */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_4rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+        fix3             = _fjsp_setzero_v2r8();
+        fiy3             = _fjsp_setzero_v2r8();
+        fiz3             = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop.
+         * Two j atoms (A and B) are processed per iteration. The loop condition
+         * jidx<j_index_end-1 stops before a trailing unpaired j atom; that atom
+         * is handled by the separate if(jidx<j_index_end) block after the loop.
+         */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx30             = _fjsp_sub_v2r8(ix3,jx0);
+            dy30             = _fjsp_sub_v2r8(iy3,jy0);
+            dz30             = _fjsp_sub_v2r8(iz3,jz0);
+
+            /* Calculate squared distance and things based on it.
+             * For i atom 0 only rinvsq00 is formed, directly from rsq00 with
+             * gmx_fjsp_inv_v2r8(); for i atoms 1-3 rinv comes from
+             * gmx_fjsp_invsqrt_v2r8() and rinvsq is its square.
+             */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+
+            rinvsq00         = gmx_fjsp_inv_v2r8(rsq00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION
+             * Force only. Presumably fvdw = (c12*rinvsix - c6)*rinvsix*rinvsq00,
+             * assuming _fjsp_msub_v2r8(a,b,c) evaluates a*b-c - TODO confirm.
+             */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,c6_00),_fjsp_mul_v2r8(rinvsix,rinvsq00));
+
+            fscal            = fvdw;
+
+            /* Update vectorial force.
+             * The same d*fscal term is added to the i accumulator and to the j
+             * accumulator fjx0/fjy0/fjz0; the j accumulator collects all four
+             * i-j interactions and is passed to the decrement helper at the end
+             * of the iteration.
+             */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS
+             * Force-only path using the tabq_coul_F table: two values are loaded
+             * per lane starting at the integer table index and combined with the
+             * fractional offset eweps. Presumably this is the linear interpolation
+             * (1-eps)*F[i] + eps*F[i+1], assuming nmsub(a,b,c)=c-a*b and
+             * madd(a,b,c)=a*b+c - TODO confirm against the intrinsic reference.
+             */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS (same scheme as for i atom 1, now for i atom 2) */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r30              = _fjsp_mul_v2r8(rsq30,rinv30);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _fjsp_mul_v2r8(iq3,jq0);
+
+            /* EWALD ELECTROSTATICS (same scheme as for i atom 1, now for i atom 3) */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r30,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq30,rinv30),_fjsp_sub_v2r8(rinvsq30,felec));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+            
+            fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+
+            gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 150 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* Remainder iteration: a single leftover j atom (jnrA only).
+             * load j atom coordinates through the one-pointer helper variants.
+             */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx30             = _fjsp_sub_v2r8(ix3,jx0);
+            dy30             = _fjsp_sub_v2r8(iy3,jy0);
+            dz30             = _fjsp_sub_v2r8(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+
+            rinvsq00         = gmx_fjsp_inv_v2r8(rsq00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,c6_00),_fjsp_mul_v2r8(rinvsix,rinvsq00));
+
+            fscal            = fvdw;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force.
+             * Unlike the paired loop, fscal has first been combined with a zero
+             * vector through _fjsp_unpacklo_v2r8(); presumably this clears the
+             * lane that has no j atom so it adds nothing - TODO confirm.
+             */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS
+             * Same arithmetic as in the paired loop, but only the table index of
+             * the first lane, ewconv.i[0], is used for the table load.
+             */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS (i atom 2, single j atom) */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r30              = _fjsp_mul_v2r8(rsq30,rinv30);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _fjsp_mul_v2r8(iq3,jq0);
+
+            /* EWALD ELECTROSTATICS (i atom 3, single j atom) */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r30,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq30,rinv30),_fjsp_sub_v2r8(rinvsq30,felec));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+            
+            fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+
+            gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 150 flops */
+        }
+
+        /* End of innermost loop.
+         * The twelve i force accumulators are handed to the update helper together
+         * with the i atom's location in f and the shift-force location in fshift.
+         */
+
+        gmx_fjsp_update_iforce_4atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations (counted per j atom, not per SIMD pair) */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 24 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops: 24 per outer iteration plus 150 per inner iteration */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_F,outeriter*24 + inneriter*150);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEw_VdwLJ_GeomW4W4_sparc64_hpc_ace_double.c b/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEw_VdwLJ_GeomW4W4_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..faa0f8e
--- /dev/null
@@ -0,0 +1,2166 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwLJ_GeomW4W4_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LennardJones
+ * Geometry:                   Water4-Water4
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecEw_VdwLJ_GeomW4W4_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B;
+    _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B;
+    _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    int              vdwjidx3A,vdwjidx3B;
+    _fjsp_v2r8       jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    _fjsp_v2r8       dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+    _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    _fjsp_v2r8       dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+    _fjsp_v2r8       dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+    _fjsp_v2r8       dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+    _fjsp_v2r8       dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+    ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+    jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+    jq3              = gmx_fjsp_set1_v2r8(charge[inr+3]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    c6_00            = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    qq11             = _fjsp_mul_v2r8(iq1,jq1);
+    qq12             = _fjsp_mul_v2r8(iq1,jq2);
+    qq13             = _fjsp_mul_v2r8(iq1,jq3);
+    qq21             = _fjsp_mul_v2r8(iq2,jq1);
+    qq22             = _fjsp_mul_v2r8(iq2,jq2);
+    qq23             = _fjsp_mul_v2r8(iq2,jq3);
+    qq31             = _fjsp_mul_v2r8(iq3,jq1);
+    qq32             = _fjsp_mul_v2r8(iq3,jq2);
+    qq33             = _fjsp_mul_v2r8(iq3,jq3);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_4rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+        fix3             = _fjsp_setzero_v2r8();
+        fiy3             = _fjsp_setzero_v2r8();
+        fiz3             = _fjsp_setzero_v2r8();
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+        vvdwsum          = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_4rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx13             = _fjsp_sub_v2r8(ix1,jx3);
+            dy13             = _fjsp_sub_v2r8(iy1,jy3);
+            dz13             = _fjsp_sub_v2r8(iz1,jz3);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+            dx23             = _fjsp_sub_v2r8(ix2,jx3);
+            dy23             = _fjsp_sub_v2r8(iy2,jy3);
+            dz23             = _fjsp_sub_v2r8(iz2,jz3);
+            dx31             = _fjsp_sub_v2r8(ix3,jx1);
+            dy31             = _fjsp_sub_v2r8(iy3,jy1);
+            dz31             = _fjsp_sub_v2r8(iz3,jz1);
+            dx32             = _fjsp_sub_v2r8(ix3,jx2);
+            dy32             = _fjsp_sub_v2r8(iy3,jy2);
+            dz32             = _fjsp_sub_v2r8(iz3,jz2);
+            dx33             = _fjsp_sub_v2r8(ix3,jx3);
+            dy33             = _fjsp_sub_v2r8(iy3,jy3);
+            dz33             = _fjsp_sub_v2r8(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+            rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+            rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+            rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+            rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+            rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+            rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+            rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+            rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+
+            rinvsq00         = gmx_fjsp_inv_v2r8(rsq00);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+            rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+            rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+            rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+            rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+            fjx3             = _fjsp_setzero_v2r8();
+            fjy3             = _fjsp_setzero_v2r8();
+            fjz3             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(rinv11,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(rinv12,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r13              = _fjsp_mul_v2r8(rsq13,rinv13);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r13,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq13,_fjsp_sub_v2r8(rinv13,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq13,rinv13),_fjsp_sub_v2r8(rinvsq13,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+            
+            fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(rinv21,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(rinv22,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r23              = _fjsp_mul_v2r8(rsq23,rinv23);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r23,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq23,_fjsp_sub_v2r8(rinv23,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq23,rinv23),_fjsp_sub_v2r8(rinvsq23,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+            
+            fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r31              = _fjsp_mul_v2r8(rsq31,rinv31);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r31,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq31,_fjsp_sub_v2r8(rinv31,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq31,rinv31),_fjsp_sub_v2r8(rinvsq31,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+            
+            fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r32              = _fjsp_mul_v2r8(rsq32,rinv32);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r32,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq32,_fjsp_sub_v2r8(rinv32,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq32,rinv32),_fjsp_sub_v2r8(rinvsq32,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+            
+            fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r33              = _fjsp_mul_v2r8(rsq33,rinv33);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r33,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq33,_fjsp_sub_v2r8(rinv33,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq33,rinv33),_fjsp_sub_v2r8(rinvsq33,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+            
+            fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+
+            gmx_fjsp_decrement_4rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 434 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_4rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx13             = _fjsp_sub_v2r8(ix1,jx3);
+            dy13             = _fjsp_sub_v2r8(iy1,jy3);
+            dz13             = _fjsp_sub_v2r8(iz1,jz3);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+            dx23             = _fjsp_sub_v2r8(ix2,jx3);
+            dy23             = _fjsp_sub_v2r8(iy2,jy3);
+            dz23             = _fjsp_sub_v2r8(iz2,jz3);
+            dx31             = _fjsp_sub_v2r8(ix3,jx1);
+            dy31             = _fjsp_sub_v2r8(iy3,jy1);
+            dz31             = _fjsp_sub_v2r8(iz3,jz1);
+            dx32             = _fjsp_sub_v2r8(ix3,jx2);
+            dy32             = _fjsp_sub_v2r8(iy3,jy2);
+            dz32             = _fjsp_sub_v2r8(iz3,jz2);
+            dx33             = _fjsp_sub_v2r8(ix3,jx3);
+            dy33             = _fjsp_sub_v2r8(iy3,jy3);
+            dz33             = _fjsp_sub_v2r8(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+            rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+            rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+            rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+            rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+            rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+            rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+            rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+            rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+
+            rinvsq00         = gmx_fjsp_inv_v2r8(rsq00);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+            rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+            rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+            rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+            rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+            fjx3             = _fjsp_setzero_v2r8();
+            fjy3             = _fjsp_setzero_v2r8();
+            fjz3             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(rinv11,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(rinv12,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r13              = _fjsp_mul_v2r8(rsq13,rinv13);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r13,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq13,_fjsp_sub_v2r8(rinv13,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq13,rinv13),_fjsp_sub_v2r8(rinvsq13,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+            
+            fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(rinv21,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(rinv22,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r23              = _fjsp_mul_v2r8(rsq23,rinv23);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r23,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq23,_fjsp_sub_v2r8(rinv23,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq23,rinv23),_fjsp_sub_v2r8(rinvsq23,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+            
+            fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r31              = _fjsp_mul_v2r8(rsq31,rinv31);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r31,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq31,_fjsp_sub_v2r8(rinv31,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq31,rinv31),_fjsp_sub_v2r8(rinvsq31,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+            
+            fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r32              = _fjsp_mul_v2r8(rsq32,rinv32);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r32,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq32,_fjsp_sub_v2r8(rinv32,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq32,rinv32),_fjsp_sub_v2r8(rinvsq32,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+            
+            fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r33              = _fjsp_mul_v2r8(rsq33,rinv33);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r33,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq33,_fjsp_sub_v2r8(rinv33,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq33,rinv33),_fjsp_sub_v2r8(rinvsq33,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+            
+            fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+
+            gmx_fjsp_decrement_4rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 434 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_4atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 26 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*434);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwLJ_GeomW4W4_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            LennardJones
+ * Geometry:                   Water4-Water4
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecEw_VdwLJ_GeomW4W4_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B;
+    _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B;
+    _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    int              vdwjidx3A,vdwjidx3B;
+    _fjsp_v2r8       jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    _fjsp_v2r8       dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+    _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    _fjsp_v2r8       dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+    _fjsp_v2r8       dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+    _fjsp_v2r8       dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+    _fjsp_v2r8       dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_F;
+    ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+    ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+    jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+    jq3              = gmx_fjsp_set1_v2r8(charge[inr+3]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    c6_00            = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    qq11             = _fjsp_mul_v2r8(iq1,jq1);
+    qq12             = _fjsp_mul_v2r8(iq1,jq2);
+    qq13             = _fjsp_mul_v2r8(iq1,jq3);
+    qq21             = _fjsp_mul_v2r8(iq2,jq1);
+    qq22             = _fjsp_mul_v2r8(iq2,jq2);
+    qq23             = _fjsp_mul_v2r8(iq2,jq3);
+    qq31             = _fjsp_mul_v2r8(iq3,jq1);
+    qq32             = _fjsp_mul_v2r8(iq3,jq2);
+    qq33             = _fjsp_mul_v2r8(iq3,jq3);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_4rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+        fix3             = _fjsp_setzero_v2r8();
+        fiy3             = _fjsp_setzero_v2r8();
+        fiz3             = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_4rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx13             = _fjsp_sub_v2r8(ix1,jx3);
+            dy13             = _fjsp_sub_v2r8(iy1,jy3);
+            dz13             = _fjsp_sub_v2r8(iz1,jz3);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+            dx23             = _fjsp_sub_v2r8(ix2,jx3);
+            dy23             = _fjsp_sub_v2r8(iy2,jy3);
+            dz23             = _fjsp_sub_v2r8(iz2,jz3);
+            dx31             = _fjsp_sub_v2r8(ix3,jx1);
+            dy31             = _fjsp_sub_v2r8(iy3,jy1);
+            dz31             = _fjsp_sub_v2r8(iz3,jz1);
+            dx32             = _fjsp_sub_v2r8(ix3,jx2);
+            dy32             = _fjsp_sub_v2r8(iy3,jy2);
+            dz32             = _fjsp_sub_v2r8(iz3,jz2);
+            dx33             = _fjsp_sub_v2r8(ix3,jx3);
+            dy33             = _fjsp_sub_v2r8(iy3,jy3);
+            dz33             = _fjsp_sub_v2r8(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+            rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+            rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+            rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+            rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+            rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+            rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+            rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+            rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+
+            rinvsq00         = gmx_fjsp_inv_v2r8(rsq00);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+            rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+            rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+            rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+            rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+            fjx3             = _fjsp_setzero_v2r8();
+            fjy3             = _fjsp_setzero_v2r8();
+            fjz3             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,c6_00),_fjsp_mul_v2r8(rinvsix,rinvsq00));
+
+            fscal            = fvdw;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r13              = _fjsp_mul_v2r8(rsq13,rinv13);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r13,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq13,rinv13),_fjsp_sub_v2r8(rinvsq13,felec));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+            
+            fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r23              = _fjsp_mul_v2r8(rsq23,rinv23);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r23,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq23,rinv23),_fjsp_sub_v2r8(rinvsq23,felec));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+            
+            fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r31              = _fjsp_mul_v2r8(rsq31,rinv31);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r31,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq31,rinv31),_fjsp_sub_v2r8(rinvsq31,felec));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+            
+            fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r32              = _fjsp_mul_v2r8(rsq32,rinv32);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r32,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq32,rinv32),_fjsp_sub_v2r8(rinvsq32,felec));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+            
+            fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r33              = _fjsp_mul_v2r8(rsq33,rinv33);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r33,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq33,rinv33),_fjsp_sub_v2r8(rinvsq33,felec));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+            
+            fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+
+            gmx_fjsp_decrement_4rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 384 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_4rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx13             = _fjsp_sub_v2r8(ix1,jx3);
+            dy13             = _fjsp_sub_v2r8(iy1,jy3);
+            dz13             = _fjsp_sub_v2r8(iz1,jz3);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+            dx23             = _fjsp_sub_v2r8(ix2,jx3);
+            dy23             = _fjsp_sub_v2r8(iy2,jy3);
+            dz23             = _fjsp_sub_v2r8(iz2,jz3);
+            dx31             = _fjsp_sub_v2r8(ix3,jx1);
+            dy31             = _fjsp_sub_v2r8(iy3,jy1);
+            dz31             = _fjsp_sub_v2r8(iz3,jz1);
+            dx32             = _fjsp_sub_v2r8(ix3,jx2);
+            dy32             = _fjsp_sub_v2r8(iy3,jy2);
+            dz32             = _fjsp_sub_v2r8(iz3,jz2);
+            dx33             = _fjsp_sub_v2r8(ix3,jx3);
+            dy33             = _fjsp_sub_v2r8(iy3,jy3);
+            dz33             = _fjsp_sub_v2r8(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+            rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+            rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+            rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+            rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+            rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+            rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+            rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+            rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+
+            rinvsq00         = gmx_fjsp_inv_v2r8(rsq00);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+            rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+            rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+            rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+            rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+            fjx3             = _fjsp_setzero_v2r8();
+            fjy3             = _fjsp_setzero_v2r8();
+            fjz3             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,c6_00),_fjsp_mul_v2r8(rinvsix,rinvsq00));
+
+            fscal            = fvdw;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r13              = _fjsp_mul_v2r8(rsq13,rinv13);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r13,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq13,rinv13),_fjsp_sub_v2r8(rinvsq13,felec));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+            
+            fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r23              = _fjsp_mul_v2r8(rsq23,rinv23);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r23,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq23,rinv23),_fjsp_sub_v2r8(rinvsq23,felec));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+            
+            fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r31              = _fjsp_mul_v2r8(rsq31,rinv31);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r31,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq31,rinv31),_fjsp_sub_v2r8(rinvsq31,felec));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+            
+            fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r32              = _fjsp_mul_v2r8(rsq32,rinv32);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r32,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq32,rinv32),_fjsp_sub_v2r8(rinvsq32,felec));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+            
+            fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r33              = _fjsp_mul_v2r8(rsq33,rinv33);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r33,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq33,rinv33),_fjsp_sub_v2r8(rinvsq33,felec));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+            
+            fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+
+            gmx_fjsp_decrement_4rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 384 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_4atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 24 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*384);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEw_VdwNone_GeomP1P1_sparc64_hpc_ace_double.c b/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEw_VdwNone_GeomP1P1_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..a26a79b
--- /dev/null
@@ -0,0 +1,551 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwNone_GeomP1P1_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: Ewald (tabulated; reads fr->ic->tabq_coul_FDV0 with a stride of 4 reals per table point)
+ * VdW interaction:            None
+ * Geometry:                   Particle-Particle (one i atom per neighborlist entry, j atoms taken two at a time)
+ * Calculate force/pot:        PotentialAndForce (the per-i potential sum is handed on with kernel_data->energygrp_elec)
+ */
+void
+nb_kernel_ElecEw_VdwNone_GeomP1P1_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     * Note: several locals below (e.g. c6_00/c12_00, crf/krf/krf2, the masks, vfconv/gbconv) are never referenced in this kernel. */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv; /* only ewconv is used: SIMD table index read back as two integers */
+
+    x                = xx[0]; /* flat real* view of the coordinate array */
+    f                = ff[0]; /* flat real* view of the force array */
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+
+    sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald); /* set here but not referenced again in this kernel */
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+    ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale); /* half the table spacing, used in the potential interpolation */
+
+    /* Give the j indices and offsets defined values up front to avoid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_1rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+
+        /* Load parameters for i particles: charge of atom inr, pre-multiplied by facel (fr->epsfac) */
+        iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_load1_v2r8(charge+inr+0));
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop: two j atoms (A,B) per pass; a leftover odd j atom is handled after the loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00); /* r = rsq * rinv */
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer; eweps is the fractional remainder */
+            ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp); /* expose both lane indices as ewconv.i[0] and ewconv.i[1] */
+            /* Gather table data for lanes A and B; NOTE(review): FDV0 layout is presumably F,D,V,0 per point - confirm */
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF); /* F + eps*D */
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV); /* V - halfspace*eps*(F+felec), assuming nmsub(a,b,c) = c - a*b */
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(rinv00,velec)); /* qq*(rinv - table potential) */
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec)); /* qq*rinv*(rinvsq - table force) */
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fscal,dx00,dy00,dz00);
+
+            /* Inner loop uses 44 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+            /* Odd number of j atoms: process the last one alone; only jnrA is loaded, so only lane A is a real interaction */
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00); /* r = rsq * rinv */
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer; eweps is the fractional remainder */
+            ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp); /* only ewconv.i[0] is read below */
+            /* Table data is fetched for lane A only; the partner registers are zero-filled before the transposes */
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF); /* F + eps*D */
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV); /* V - halfspace*eps*(F+felec), assuming nmsub(a,b,c) = c - a*b */
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(rinv00,velec)); /* qq*(rinv - table potential) */
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec)); /* qq*rinv*(rinvsq - table force) */
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8()); /* presumably keeps lane A and zeroes the unused lane B before summing */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8()); /* same masking for the force scalar */
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fscal,dx00,dy00,dz00);
+
+            /* Inner loop uses 44 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_1atom_swizzle_v2r8(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+
+        /* Increment number of inner iterations (counted per j atom, not per two-atom SIMD pass) */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 8 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VF,outeriter*8 + inneriter*44);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwNone_GeomP1P1_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            None
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecEw_VdwNone_GeomP1P1_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+
+    sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_F;
+    ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+    ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_1rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+
+        /* Load parameters for i particles */
+        iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_load1_v2r8(charge+inr+0));
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fscal,dx00,dy00,dz00);
+
+            /* Inner loop uses 39 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fscal,dx00,dy00,dz00);
+
+            /* Inner loop uses 39 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_1atom_swizzle_v2r8(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 7 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_F,outeriter*7 + inneriter*39);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEw_VdwNone_GeomW3P1_sparc64_hpc_ace_double.c b/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEw_VdwNone_GeomW3P1_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..9716ad0
--- /dev/null
@@ -0,0 +1,971 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwNone_GeomW3P1_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            None
+ * Geometry:                   Water3-Particle
+ * Calculate force/pot:        PotentialAndForce
+ *
+ * For every entry of the neighbor list, the three atoms of the i water
+ * (suffixes 0,1,2) interact with each j particle of that entry through
+ * table-based Ewald electrostatics. The j particles are processed two at a
+ * time (A and B); when the count is odd, the last one is handled on its own
+ * after the unrolled loop.
+ *
+ * nlist        neighbor list; nri, iinr, jindex, jjnr, shift and gid are read
+ * xx           coordinates, accessed as a flat real array (DIM reals per atom)
+ * ff           forces, accessed as a flat real array and handed to the
+ *              decrement/update helpers for the j and i atoms
+ * fr           provides shift_vec, fshift, epsfac and the Ewald table data
+ *              (fr->ic->tabq_coul_FDV0, tabq_scale, sh_ewald)
+ * mdatoms      provides the chargeA array
+ * kernel_data  energygrp_elec[gid[iidx]] is passed to the potential update helper
+ * nrnb         flop counters, updated once at the end through inc_nrnb()
+ */
+void
+nb_kernel_ElecEw_VdwNone_GeomW3P1_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     *
+     * Many of the locals declared below are not referenced in this particular kernel
+     * (e.g. tx/ty/tz, rcutoff*, jidxall, vdwioffset*, vdwjidx0*, isai*, isaj0, c6_*, c12_*,
+     * crf/krf/krf2, dummy_mask, cutoff_mask, one, two, vfconv, gbconv).
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv; /* ewconv: the stored table-index vector is read back as two integers */
+
+    x                = xx[0]; /* flat view of the coordinates */
+    f                = ff[0]; /* flat view of the forces */
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+
+    sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald); /* assigned but not used further in this kernel */
+    ewtab            = fr->ic->tabq_coul_FDV0; /* indexed below with a stride of 4 reals per table point */
+    ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+    ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale); /* enters the potential interpolation below */
+
+    /* Setup water-specific parameters.
+     * The i charges are read once, using the first entry of the list, multiplied by
+     * facel and then reused for all outer iterations.
+     * NOTE(review): this presumes every i water in the list has the same charges and
+     * that iinr[0] may be read even when nri is 0 - confirm with the caller.
+     */
+    inr              = nlist->iinr[0];
+    iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop: two j particles (A and B) per iteration.
+         * An odd leftover j particle is handled after the loop.
+         */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+
+            fjx0             = _fjsp_setzero_v2r8(); /* j force terms are summed over the three i atoms below */
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             * i atom 0 <-> j atom 0  *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00); /* rsq times rinv */
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp)); /* remainder of ewrt after removing the converted index */
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] ); /* start of the 4-real table entry selected by ewconv.i[0] */
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] ); /* start of the 4-real table entry selected by ewconv.i[1] */
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD); /* presumably regroups the two loads in place into F and D vectors - confirm against the macro */
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2); /* offset +2 of the entry selected by ewconv.i[0] */
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2); /* offset +2 of the entry selected by ewconv.i[1]; ewtabFn is only scratch here */
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn); /* presumably leaves both V values in ewtabV - confirm against the macro */
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(rinv00,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             * i atom 1 <-> j atom 0  *
+             **************************/
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] ); /* same table lookup sequence as for i atom 0 */
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(rinv10,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             * i atom 2 <-> j atom 0  *
+             **************************/
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] ); /* same table lookup sequence as for i atom 0 */
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(rinv20,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0); /* hand the summed j terms to the helper for both j atoms */
+
+            /* Inner loop uses 135 flops */
+        }
+
+        if(jidx<j_index_end) /* true only when one j particle is left after the 2-wide loop */
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0); /* single charge loaded on top of a zero vector */
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             * i atom 0 <-> j atom 0  *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] ); /* only ewconv.i[0] is used as a table index in this branch */
+            ewtabD           = _fjsp_setzero_v2r8(); /* zero vector stands in for the second table load */
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8(); /* zero vector stands in for the second table load */
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(rinv00,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8()); /* presumably keeps the low element and zeroes the other - confirm intrinsic semantics */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8()); /* same masking with a zero vector as for velec */
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             * i atom 1 <-> j atom 0  *
+             **************************/
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(rinv10,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             * i atom 2 <-> j atom 0  *
+             **************************/
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(rinv20,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0); /* single-pointer variant: only j atom A is updated */
+
+            /* Inner loop uses 135 flops */
+        }
+
+        /* End of innermost loop: the accumulated i forces are handed to the helper
+         * together with the force and shift-force locations of this i water.
+         */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start; /* counts j particles, not 2-wide loop trips */
+
+        /* Outer loop uses 19 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops, using the per-iteration counts noted in the loops above */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3_VF,outeriter*19 + inneriter*135);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwNone_GeomW3P1_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            None
+ * Geometry:                   Water3-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecEw_VdwNone_GeomW3P1_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Force-only kernel: tabulated Ewald electrostatics between a three-site i group
+     * (sites 0,1,2, i.e. atoms inr, inr+1, inr+2) and single j particles. No VdW term
+     * is evaluated and no potential energy is accumulated.
+     *
+     * nlist       - neighborlist; nri, iinr, jindex, jjnr and shift are read (gid is fetched but unused)
+     * xx          - coordinates, addressed below as a flat real array with stride DIM per atom
+     * ff          - forces, updated in place for both the j atoms and the i sites
+     * fr          - supplies shift_vec, fshift, epsfac and the Ewald table data in fr->ic
+     * mdatoms     - supplies the per-atom charges (chargeA)
+     * kernel_data - not referenced by this kernel
+     * nrnb        - receives the flop count through inc_nrnb() at the end
+     *
+     * Many of the locals declared below (e.g. ggid, tx/ty/tz, rcutoff, velecsum, c6_00,
+     * dummy_mask, vfconv, gbconv, one, two) are never read in this kernel flavor;
+     * presumably they come from the shared generator template.
+     *
+     * Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv; /* ewconv: SIMD table indices are stored to .simd and read back as integers through .i[] */
+
+    x                = xx[0]; /* flat real* view; atom n starts at offset DIM*n */
+    f                = ff[0]; /* same flat layout for the forces */
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid; /* not read again in this kernel */
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+
+    sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald); /* not read again in this kernel */
+    ewtab            = fr->ic->tabq_coul_F; /* indexed directly by the integer table index in the loops below */
+    ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+    ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale); /* not read again in this kernel */
+
+    /* Setup water-specific parameters.
+     * The three i-site charges (pre-multiplied by facel) are read once, from the first
+     * list entry, and reused for every i group in the outer loop; this relies on all
+     * i groups carrying the same charges.
+     * NOTE(review): iinr[0] is read before the nri loop, so callers presumably
+     * guarantee nri > 0 - confirm.
+     */
+    inr              = nlist->iinr[0];
+    iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop.
+         * Two j atoms (A and B) are handled per iteration; the loop stops while a
+         * full pair is still available, and a single leftover j atom is handled by
+         * the separate block that follows the loop.
+         */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00); /* r = r^2 * (1/r) */
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer.
+             * eweps is the remainder ewrt - index. The two table entries fetched for each
+             * lane (ewtabF, ewtabFn) are blended as ewtabF - eweps*ewtabF + eweps*ewtabFn,
+             * assuming madd(a,b,c) = a*b+c and nmsub(a,b,c) = c-a*b for the intrinsics.
+             * The scalar force factor is then qq*rinv*(rinvsq - blended value).
+             */
+            ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0); /* j forces summed over the three i sites, applied to atoms A and B */
+
+            /* Inner loop uses 120 flops */
+        }
+
+        if(jidx<j_index_end) /* exactly one unpaired j atom is left over */
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0); /* presumably charge in the first element, zero in the other - confirm against the intrinsic docs */
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer.
+             * Only ewconv.i[0] is used for the table lookup in this single-atom block.
+             */
+            ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8()); /* presumably zeroes the unused second element so it contributes no force - confirm */
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 120 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset); /* hands the accumulated i-site forces to the helper along with the force and shift-force destinations */
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 18 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3_F,outeriter*18 + inneriter*120);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEw_VdwNone_GeomW3W3_sparc64_hpc_ace_double.c b/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEw_VdwNone_GeomW3W3_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..91814d3
--- /dev/null
@@ -0,0 +1,1989 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwNone_GeomW3W3_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            None
+ * Geometry:                   Water3-Water3
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecEw_VdwNone_GeomW3W3_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B;
+    _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B;
+    _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+    _fjsp_v2r8       dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+
+    sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+    ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+
+    jq0              = gmx_fjsp_set1_v2r8(charge[inr+0]);
+    jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+    jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+    qq00             = _fjsp_mul_v2r8(iq0,jq0);
+    qq01             = _fjsp_mul_v2r8(iq0,jq1);
+    qq02             = _fjsp_mul_v2r8(iq0,jq2);
+    qq10             = _fjsp_mul_v2r8(iq1,jq0);
+    qq11             = _fjsp_mul_v2r8(iq1,jq1);
+    qq12             = _fjsp_mul_v2r8(iq1,jq2);
+    qq20             = _fjsp_mul_v2r8(iq2,jq0);
+    qq21             = _fjsp_mul_v2r8(iq2,jq1);
+    qq22             = _fjsp_mul_v2r8(iq2,jq2);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx01             = _fjsp_sub_v2r8(ix0,jx1);
+            dy01             = _fjsp_sub_v2r8(iy0,jy1);
+            dz01             = _fjsp_sub_v2r8(iz0,jz1);
+            dx02             = _fjsp_sub_v2r8(ix0,jx2);
+            dy02             = _fjsp_sub_v2r8(iy0,jy2);
+            dz02             = _fjsp_sub_v2r8(iz0,jz2);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+            rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+            rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+            rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(rinv00,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r01              = _fjsp_mul_v2r8(rsq01,rinv01);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r01,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq01,_fjsp_sub_v2r8(rinv01,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq01,rinv01),_fjsp_sub_v2r8(rinvsq01,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+            
+            fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r02              = _fjsp_mul_v2r8(rsq02,rinv02);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r02,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq02,_fjsp_sub_v2r8(rinv02,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq02,rinv02),_fjsp_sub_v2r8(rinvsq02,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+            
+            fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(rinv10,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(rinv11,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(rinv12,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(rinv20,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(rinv21,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(rinv22,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            gmx_fjsp_decrement_3rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 396 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx01             = _fjsp_sub_v2r8(ix0,jx1);
+            dy01             = _fjsp_sub_v2r8(iy0,jy1);
+            dz01             = _fjsp_sub_v2r8(iz0,jz1);
+            dx02             = _fjsp_sub_v2r8(ix0,jx2);
+            dy02             = _fjsp_sub_v2r8(iy0,jy2);
+            dz02             = _fjsp_sub_v2r8(iz0,jz2);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+            rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+            rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+            rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(rinv00,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r01              = _fjsp_mul_v2r8(rsq01,rinv01);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r01,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq01,_fjsp_sub_v2r8(rinv01,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq01,rinv01),_fjsp_sub_v2r8(rinvsq01,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+            
+            fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r02              = _fjsp_mul_v2r8(rsq02,rinv02);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r02,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq02,_fjsp_sub_v2r8(rinv02,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq02,rinv02),_fjsp_sub_v2r8(rinvsq02,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+            
+            fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(rinv10,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(rinv11,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(rinv12,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(rinv20,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(rinv21,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(rinv22,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            gmx_fjsp_decrement_3rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 396 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 19 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_VF,outeriter*19 + inneriter*396);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwNone_GeomW3W3_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            None
+ * Geometry:                   Water3-Water3
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecEw_VdwNone_GeomW3W3_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. the two different
+     * jnr indices whose data is put in the two positions of the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B;
+    _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B;
+    _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+    _fjsp_v2r8       dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+
+    sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_F;
+    ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+    ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+
+    jq0              = gmx_fjsp_set1_v2r8(charge[inr+0]);
+    jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+    jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+    qq00             = _fjsp_mul_v2r8(iq0,jq0);
+    qq01             = _fjsp_mul_v2r8(iq0,jq1);
+    qq02             = _fjsp_mul_v2r8(iq0,jq2);
+    qq10             = _fjsp_mul_v2r8(iq1,jq0);
+    qq11             = _fjsp_mul_v2r8(iq1,jq1);
+    qq12             = _fjsp_mul_v2r8(iq1,jq2);
+    qq20             = _fjsp_mul_v2r8(iq2,jq0);
+    qq21             = _fjsp_mul_v2r8(iq2,jq1);
+    qq22             = _fjsp_mul_v2r8(iq2,jq2);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx01             = _fjsp_sub_v2r8(ix0,jx1);
+            dy01             = _fjsp_sub_v2r8(iy0,jy1);
+            dz01             = _fjsp_sub_v2r8(iz0,jz1);
+            dx02             = _fjsp_sub_v2r8(ix0,jx2);
+            dy02             = _fjsp_sub_v2r8(iy0,jy2);
+            dz02             = _fjsp_sub_v2r8(iz0,jz2);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+            rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+            rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+            rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r01              = _fjsp_mul_v2r8(rsq01,rinv01);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r01,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq01,rinv01),_fjsp_sub_v2r8(rinvsq01,felec));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+            
+            fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r02              = _fjsp_mul_v2r8(rsq02,rinv02);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r02,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq02,rinv02),_fjsp_sub_v2r8(rinvsq02,felec));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+            
+            fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            gmx_fjsp_decrement_3rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 351 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx01             = _fjsp_sub_v2r8(ix0,jx1);
+            dy01             = _fjsp_sub_v2r8(iy0,jy1);
+            dz01             = _fjsp_sub_v2r8(iz0,jz1);
+            dx02             = _fjsp_sub_v2r8(ix0,jx2);
+            dy02             = _fjsp_sub_v2r8(iy0,jy2);
+            dz02             = _fjsp_sub_v2r8(iz0,jz2);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+            rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+            rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+            rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r00,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq00,rinv00),_fjsp_sub_v2r8(rinvsq00,felec));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r01              = _fjsp_mul_v2r8(rsq01,rinv01);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r01,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq01,rinv01),_fjsp_sub_v2r8(rinvsq01,felec));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+            
+            fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r02              = _fjsp_mul_v2r8(rsq02,rinv02);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r02,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq02,rinv02),_fjsp_sub_v2r8(rinvsq02,felec));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+            
+            fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            gmx_fjsp_decrement_3rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 351 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 18 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_F,outeriter*18 + inneriter*351);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEw_VdwNone_GeomW4P1_sparc64_hpc_ace_double.c b/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEw_VdwNone_GeomW4P1_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..c787566
--- /dev/null
@@ -0,0 +1,971 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwNone_GeomW4P1_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            None
+ * Geometry:                   Water4-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecEw_VdwNone_GeomW4P1_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+
+    sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+    ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
+                                                 &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+        fix3             = _fjsp_setzero_v2r8();
+        fiy3             = _fjsp_setzero_v2r8();
+        fiz3             = _fjsp_setzero_v2r8();
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx30             = _fjsp_sub_v2r8(ix3,jx0);
+            dy30             = _fjsp_sub_v2r8(iy3,jy0);
+            dz30             = _fjsp_sub_v2r8(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(rinv10,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(rinv20,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r30              = _fjsp_mul_v2r8(rsq30,rinv30);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _fjsp_mul_v2r8(iq3,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r30,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq30,_fjsp_sub_v2r8(rinv30,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq30,rinv30),_fjsp_sub_v2r8(rinvsq30,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+            
+            fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+
+            gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 135 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx30             = _fjsp_sub_v2r8(ix3,jx0);
+            dy30             = _fjsp_sub_v2r8(iy3,jy0);
+            dz30             = _fjsp_sub_v2r8(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(rinv10,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(rinv20,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r30              = _fjsp_mul_v2r8(rsq30,rinv30);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _fjsp_mul_v2r8(iq3,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r30,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq30,_fjsp_sub_v2r8(rinv30,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq30,rinv30),_fjsp_sub_v2r8(rinvsq30,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+            
+            fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+
+            gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 135 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset+DIM,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 19 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4_VF,outeriter*19 + inneriter*135);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwNone_GeomW4P1_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            None
+ * Geometry:                   Water4-Particle
+ * Calculate force/pot:        Force
+ *
+ * Force-only kernel: for every i entry of the neighbor list it accumulates the
+ * tabulated-Ewald electrostatic force between i sites 1, 2 and 3 (site 0 of the
+ * i entry is skipped via the +DIM coordinate offset and has no charge loaded)
+ * and each j particle. No potential energy is accumulated in this kernel.
+ *
+ * j particles are processed two per iteration (SIMD lanes A and B); when the
+ * j count is odd, the last j particle is handled in a separate branch in which
+ * the force scalar is passed through _fjsp_unpacklo_v2r8 with a zero vector
+ * before use.
+ *
+ * Parameters (as used by the code below):
+ *   nlist        neighbor list; nri, iinr, jindex, jjnr and shift are used
+ *                (gid is fetched into a local but not referenced afterwards).
+ *   xx           coordinates, addressed as a flat real array through xx[0].
+ *   ff           forces, addressed as a flat real array through ff[0]; updated
+ *                by the gmx_fjsp_decrement_* and gmx_fjsp_update_iforce_* helpers.
+ *   fr           supplies shift_vec, fshift, epsfac and, through fr->ic, the
+ *                force table tabq_coul_F and its scale tabq_scale
+ *                (ic->sh_ewald is loaded but not referenced afterwards).
+ *   mdatoms      supplies chargeA.
+ *   kernel_data  not referenced by this force-only kernel.
+ *   nrnb         flop accounting, updated once at the end through inc_nrnb().
+ *
+ * Several locals declared below are never referenced in this kernel; they
+ * appear to come from the shared generator template.
+ */
+void
+nb_kernel_ElecEw_VdwNone_GeomW4P1_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv; /* only ewconv is used: it exposes the two table indices stored from a SIMD value */
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+
+    sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_F;
+    ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+    ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters: the charges of i sites 1-3 are read once,
+     * from the first i entry of the list (iinr[0]), and pre-multiplied by facel.
+     * NOTE(review): this presumably relies on every i entry being the same water
+     * model and on the list being non-empty - confirm against the caller.
+     */
+    inr              = nlist->iinr[0];
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+
+    /* Give the j indices/offsets defined values up front to avoid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector.
+         * The +DIM offset starts at the second atom of the i entry, so only
+         * sites 1-3 are loaded here.
+         */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
+                                                 &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+        fix3             = _fjsp_setzero_v2r8();
+        fiy3             = _fjsp_setzero_v2r8();
+        fiz3             = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop: two j particles (A and B) per iteration.
+         * The loop bound j_index_end-1 leaves an odd last j particle for the
+         * separate branch that follows the loop.
+         */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx30             = _fjsp_sub_v2r8(ix3,jx0);
+            dy30             = _fjsp_sub_v2r8(iy3,jy0);
+            dz30             = _fjsp_sub_v2r8(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer.
+             * eweps is the remaining fractional part; the two integer indices are
+             * read back as scalars through the ewconv union.
+             */
+            ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF)); /* NOTE(review): assuming nmsub(a,b,c) = c - a*b, this is (1-eweps)*ewtabF + eweps*ewtabFn - confirm against the HPC-ACE intrinsic reference */
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r30              = _fjsp_mul_v2r8(rsq30,rinv30);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _fjsp_mul_v2r8(iq3,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r30,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq30,rinv30),_fjsp_sub_v2r8(rinvsq30,felec));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+            
+            fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+
+            gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 120 flops */
+        }
+
+        if(jidx<j_index_end) /* odd j count: one unpaired j particle is left after the paired loop */
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx30             = _fjsp_sub_v2r8(ix3,jx0);
+            dy30             = _fjsp_sub_v2r8(iy3,jy0);
+            dz30             = _fjsp_sub_v2r8(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r10              = _fjsp_mul_v2r8(rsq10,rinv10);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer.
+             * Only ewconv.i[0] is used for the table load in this single-j branch.
+             */
+            ewrt             = _fjsp_mul_v2r8(r10,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq10,rinv10),_fjsp_sub_v2r8(rinvsq10,felec));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8()); /* NOTE(review): presumably keeps lane 0 and zeroes the unused second lane - confirm intrinsic semantics */
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r20              = _fjsp_mul_v2r8(rsq20,rinv20);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r20,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq20,rinv20),_fjsp_sub_v2r8(rinvsq20,felec));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r30              = _fjsp_mul_v2r8(rsq30,rinv30);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _fjsp_mul_v2r8(iq3,jq0);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r30,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq30,rinv30),_fjsp_sub_v2r8(rinvsq30,felec));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+            
+            fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+
+            gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 120 flops */
+        }
+
+        /* End of innermost loop.
+         * The accumulated i forces for sites 1-3 are handed to the helper together
+         * with the force address of site 1 (f+i_coord_offset+DIM) and the shift-force
+         * entry for this list.
+         */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset+DIM,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 18 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops, using the per-iteration counts quoted in the loop comments above */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4_F,outeriter*18 + inneriter*120);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEw_VdwNone_GeomW4W4_sparc64_hpc_ace_double.c b/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecEw_VdwNone_GeomW4W4_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..24cbc9a
--- /dev/null
@@ -0,0 +1,1989 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwNone_GeomW4W4_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            None
+ * Geometry:                   Water4-Water4
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecEw_VdwNone_GeomW4W4_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx1A,vdwjidx1B;
+    _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B;
+    _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    int              vdwjidx3A,vdwjidx3B;
+    _fjsp_v2r8       jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+    _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    _fjsp_v2r8       dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+    _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    _fjsp_v2r8       dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+    _fjsp_v2r8       dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+    _fjsp_v2r8       dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+    _fjsp_v2r8       dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+
+    sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+    ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+
+    jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+    jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+    jq3              = gmx_fjsp_set1_v2r8(charge[inr+3]);
+    qq11             = _fjsp_mul_v2r8(iq1,jq1);
+    qq12             = _fjsp_mul_v2r8(iq1,jq2);
+    qq13             = _fjsp_mul_v2r8(iq1,jq3);
+    qq21             = _fjsp_mul_v2r8(iq2,jq1);
+    qq22             = _fjsp_mul_v2r8(iq2,jq2);
+    qq23             = _fjsp_mul_v2r8(iq2,jq3);
+    qq31             = _fjsp_mul_v2r8(iq3,jq1);
+    qq32             = _fjsp_mul_v2r8(iq3,jq2);
+    qq33             = _fjsp_mul_v2r8(iq3,jq3);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
+                                                 &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+        fix3             = _fjsp_setzero_v2r8();
+        fiy3             = _fjsp_setzero_v2r8();
+        fiz3             = _fjsp_setzero_v2r8();
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
+                                              &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx13             = _fjsp_sub_v2r8(ix1,jx3);
+            dy13             = _fjsp_sub_v2r8(iy1,jy3);
+            dz13             = _fjsp_sub_v2r8(iz1,jz3);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+            dx23             = _fjsp_sub_v2r8(ix2,jx3);
+            dy23             = _fjsp_sub_v2r8(iy2,jy3);
+            dz23             = _fjsp_sub_v2r8(iz2,jz3);
+            dx31             = _fjsp_sub_v2r8(ix3,jx1);
+            dy31             = _fjsp_sub_v2r8(iy3,jy1);
+            dz31             = _fjsp_sub_v2r8(iz3,jz1);
+            dx32             = _fjsp_sub_v2r8(ix3,jx2);
+            dy32             = _fjsp_sub_v2r8(iy3,jy2);
+            dz32             = _fjsp_sub_v2r8(iz3,jz2);
+            dx33             = _fjsp_sub_v2r8(ix3,jx3);
+            dy33             = _fjsp_sub_v2r8(iy3,jy3);
+            dz33             = _fjsp_sub_v2r8(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+            rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+            rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+            rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+            rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+            rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+            rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+            rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+            rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+            rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+            rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+            rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+            rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+            fjx3             = _fjsp_setzero_v2r8();
+            fjy3             = _fjsp_setzero_v2r8();
+            fjz3             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(rinv11,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(rinv12,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r13              = _fjsp_mul_v2r8(rsq13,rinv13);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r13,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq13,_fjsp_sub_v2r8(rinv13,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq13,rinv13),_fjsp_sub_v2r8(rinvsq13,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+            
+            fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(rinv21,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(rinv22,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r23              = _fjsp_mul_v2r8(rsq23,rinv23);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r23,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq23,_fjsp_sub_v2r8(rinv23,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq23,rinv23),_fjsp_sub_v2r8(rinvsq23,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+            
+            fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r31              = _fjsp_mul_v2r8(rsq31,rinv31);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r31,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq31,_fjsp_sub_v2r8(rinv31,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq31,rinv31),_fjsp_sub_v2r8(rinvsq31,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+            
+            fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r32              = _fjsp_mul_v2r8(rsq32,rinv32);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r32,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq32,_fjsp_sub_v2r8(rinv32,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq32,rinv32),_fjsp_sub_v2r8(rinvsq32,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+            
+            fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r33              = _fjsp_mul_v2r8(rsq33,rinv33);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r33,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq33,_fjsp_sub_v2r8(rinv33,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq33,rinv33),_fjsp_sub_v2r8(rinvsq33,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+            
+            fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+
+            gmx_fjsp_decrement_3rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA+DIM,f+j_coord_offsetB+DIM,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 396 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA+DIM,
+                                              &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx13             = _fjsp_sub_v2r8(ix1,jx3);
+            dy13             = _fjsp_sub_v2r8(iy1,jy3);
+            dz13             = _fjsp_sub_v2r8(iz1,jz3);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+            dx23             = _fjsp_sub_v2r8(ix2,jx3);
+            dy23             = _fjsp_sub_v2r8(iy2,jy3);
+            dz23             = _fjsp_sub_v2r8(iz2,jz3);
+            dx31             = _fjsp_sub_v2r8(ix3,jx1);
+            dy31             = _fjsp_sub_v2r8(iy3,jy1);
+            dz31             = _fjsp_sub_v2r8(iz3,jz1);
+            dx32             = _fjsp_sub_v2r8(ix3,jx2);
+            dy32             = _fjsp_sub_v2r8(iy3,jy2);
+            dz32             = _fjsp_sub_v2r8(iz3,jz2);
+            dx33             = _fjsp_sub_v2r8(ix3,jx3);
+            dy33             = _fjsp_sub_v2r8(iy3,jy3);
+            dz33             = _fjsp_sub_v2r8(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+            rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+            rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+            rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+            rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+            rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+            rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+            rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+            rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+            rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+            rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+            rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+            rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+            fjx3             = _fjsp_setzero_v2r8();
+            fjy3             = _fjsp_setzero_v2r8();
+            fjz3             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(rinv11,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(rinv12,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r13              = _fjsp_mul_v2r8(rsq13,rinv13);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r13,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq13,_fjsp_sub_v2r8(rinv13,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq13,rinv13),_fjsp_sub_v2r8(rinvsq13,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+            
+            fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(rinv21,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(rinv22,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r23              = _fjsp_mul_v2r8(rsq23,rinv23);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r23,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq23,_fjsp_sub_v2r8(rinv23,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq23,rinv23),_fjsp_sub_v2r8(rinvsq23,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+            
+            fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r31              = _fjsp_mul_v2r8(rsq31,rinv31);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r31,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq31,_fjsp_sub_v2r8(rinv31,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq31,rinv31),_fjsp_sub_v2r8(rinvsq31,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+            
+            fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r32              = _fjsp_mul_v2r8(rsq32,rinv32);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r32,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq32,_fjsp_sub_v2r8(rinv32,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq32,rinv32),_fjsp_sub_v2r8(rinvsq32,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+            
+            fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r33              = _fjsp_mul_v2r8(rsq33,rinv33);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r33,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            ewtabD           = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            ewtabFn          = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq33,_fjsp_sub_v2r8(rinv33,velec));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq33,rinv33),_fjsp_sub_v2r8(rinvsq33,felec));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+            
+            fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+
+            gmx_fjsp_decrement_3rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA+DIM,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 396 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset+DIM,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 19 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_VF,outeriter*19 + inneriter*396);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwNone_GeomW4W4_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: Ewald
+ * VdW interaction:            None
+ * Geometry:                   Water4-Water4
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecEw_VdwNone_GeomW4W4_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx1A,vdwjidx1B;
+    _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B;
+    _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    int              vdwjidx3A,vdwjidx3B;
+    _fjsp_v2r8       jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+    _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    _fjsp_v2r8       dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+    _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    _fjsp_v2r8       dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+    _fjsp_v2r8       dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+    _fjsp_v2r8       dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+    _fjsp_v2r8       dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+
+    sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+    ewtab            = fr->ic->tabq_coul_F;
+    ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+    ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+
+    jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+    jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+    jq3              = gmx_fjsp_set1_v2r8(charge[inr+3]);
+    qq11             = _fjsp_mul_v2r8(iq1,jq1);
+    qq12             = _fjsp_mul_v2r8(iq1,jq2);
+    qq13             = _fjsp_mul_v2r8(iq1,jq3);
+    qq21             = _fjsp_mul_v2r8(iq2,jq1);
+    qq22             = _fjsp_mul_v2r8(iq2,jq2);
+    qq23             = _fjsp_mul_v2r8(iq2,jq3);
+    qq31             = _fjsp_mul_v2r8(iq3,jq1);
+    qq32             = _fjsp_mul_v2r8(iq3,jq2);
+    qq33             = _fjsp_mul_v2r8(iq3,jq3);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
+                                                 &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+        fix3             = _fjsp_setzero_v2r8();
+        fiy3             = _fjsp_setzero_v2r8();
+        fiz3             = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
+                                              &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx13             = _fjsp_sub_v2r8(ix1,jx3);
+            dy13             = _fjsp_sub_v2r8(iy1,jy3);
+            dz13             = _fjsp_sub_v2r8(iz1,jz3);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+            dx23             = _fjsp_sub_v2r8(ix2,jx3);
+            dy23             = _fjsp_sub_v2r8(iy2,jy3);
+            dz23             = _fjsp_sub_v2r8(iz2,jz3);
+            dx31             = _fjsp_sub_v2r8(ix3,jx1);
+            dy31             = _fjsp_sub_v2r8(iy3,jy1);
+            dz31             = _fjsp_sub_v2r8(iz3,jz1);
+            dx32             = _fjsp_sub_v2r8(ix3,jx2);
+            dy32             = _fjsp_sub_v2r8(iy3,jy2);
+            dz32             = _fjsp_sub_v2r8(iz3,jz2);
+            dx33             = _fjsp_sub_v2r8(ix3,jx3);
+            dy33             = _fjsp_sub_v2r8(iy3,jy3);
+            dz33             = _fjsp_sub_v2r8(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+            rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+            rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+            rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+            rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+            rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+            rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+            rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+            rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+            rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+            rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+            rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+            rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+            fjx3             = _fjsp_setzero_v2r8();
+            fjy3             = _fjsp_setzero_v2r8();
+            fjz3             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r13              = _fjsp_mul_v2r8(rsq13,rinv13);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r13,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq13,rinv13),_fjsp_sub_v2r8(rinvsq13,felec));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+            
+            fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r23              = _fjsp_mul_v2r8(rsq23,rinv23);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r23,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq23,rinv23),_fjsp_sub_v2r8(rinvsq23,felec));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+            
+            fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r31              = _fjsp_mul_v2r8(rsq31,rinv31);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r31,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq31,rinv31),_fjsp_sub_v2r8(rinvsq31,felec));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+            
+            fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r32              = _fjsp_mul_v2r8(rsq32,rinv32);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r32,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq32,rinv32),_fjsp_sub_v2r8(rinvsq32,felec));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+            
+            fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r33              = _fjsp_mul_v2r8(rsq33,rinv33);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r33,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq33,rinv33),_fjsp_sub_v2r8(rinvsq33,felec));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+            
+            fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+
+            gmx_fjsp_decrement_3rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA+DIM,f+j_coord_offsetB+DIM,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 351 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA+DIM,
+                                              &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx13             = _fjsp_sub_v2r8(ix1,jx3);
+            dy13             = _fjsp_sub_v2r8(iy1,jy3);
+            dz13             = _fjsp_sub_v2r8(iz1,jz3);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+            dx23             = _fjsp_sub_v2r8(ix2,jx3);
+            dy23             = _fjsp_sub_v2r8(iy2,jy3);
+            dz23             = _fjsp_sub_v2r8(iz2,jz3);
+            dx31             = _fjsp_sub_v2r8(ix3,jx1);
+            dy31             = _fjsp_sub_v2r8(iy3,jy1);
+            dz31             = _fjsp_sub_v2r8(iz3,jz1);
+            dx32             = _fjsp_sub_v2r8(ix3,jx2);
+            dy32             = _fjsp_sub_v2r8(iy3,jy2);
+            dz32             = _fjsp_sub_v2r8(iz3,jz2);
+            dx33             = _fjsp_sub_v2r8(ix3,jx3);
+            dy33             = _fjsp_sub_v2r8(iy3,jy3);
+            dz33             = _fjsp_sub_v2r8(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+            rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+            rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+            rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+            rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+            rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+            rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+            rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+            rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+            rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+            rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+            rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+            rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+            fjx3             = _fjsp_setzero_v2r8();
+            fjy3             = _fjsp_setzero_v2r8();
+            fjz3             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r11              = _fjsp_mul_v2r8(rsq11,rinv11);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r11,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq11,rinv11),_fjsp_sub_v2r8(rinvsq11,felec));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r12              = _fjsp_mul_v2r8(rsq12,rinv12);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r12,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq12,rinv12),_fjsp_sub_v2r8(rinvsq12,felec));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r13              = _fjsp_mul_v2r8(rsq13,rinv13);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r13,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq13,rinv13),_fjsp_sub_v2r8(rinvsq13,felec));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+            
+            fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r21              = _fjsp_mul_v2r8(rsq21,rinv21);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r21,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq21,rinv21),_fjsp_sub_v2r8(rinvsq21,felec));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r22              = _fjsp_mul_v2r8(rsq22,rinv22);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r22,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq22,rinv22),_fjsp_sub_v2r8(rinvsq22,felec));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r23              = _fjsp_mul_v2r8(rsq23,rinv23);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r23,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq23,rinv23),_fjsp_sub_v2r8(rinvsq23,felec));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+            
+            fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r31              = _fjsp_mul_v2r8(rsq31,rinv31);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r31,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq31,rinv31),_fjsp_sub_v2r8(rinvsq31,felec));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+            
+            fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r32              = _fjsp_mul_v2r8(rsq32,rinv32);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r32,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq32,rinv32),_fjsp_sub_v2r8(rinvsq32,felec));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+            
+            fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r33              = _fjsp_mul_v2r8(rsq33,rinv33);
+
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r33,ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq33,rinv33),_fjsp_sub_v2r8(rinvsq33,felec));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+            
+            fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+
+            gmx_fjsp_decrement_3rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA+DIM,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 351 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset+DIM,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 18 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_F,outeriter*18 + inneriter*351);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecGB_VdwCSTab_GeomP1P1_sparc64_hpc_ace_double.c b/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecGB_VdwCSTab_GeomP1P1_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..93a7338
--- /dev/null
@@ -0,0 +1,820 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecGB_VdwCSTab_GeomP1P1_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: GeneralizedBorn
+ * VdW interaction:            CubicSplineTable
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        PotentialAndForce
+ *
+ * For every i particle in nlist the j neighbours are processed two at a time in a
+ * two-element double precision SIMD register; a single left-over j particle (odd
+ * neighbour count) is handled by a separate block that only uses the low element.
+ * Per pair the kernel evaluates the tabulated Generalized Born term (fr->gbtab),
+ * plain Coulomb (qq*rinv) and tabulated dispersion/repulsion (kernel_data->table_vdw).
+ *
+ * Parameters:
+ *   nlist        neighbour list; nri, iinr, jindex, jjnr, shift and gid are read
+ *   xx           coordinates, accessed as a flat real array
+ *   ff           forces, accessed as a flat real array and updated through the
+ *                gmx_fjsp_* force helpers
+ *   fr           force record; shift_vec, fshift, epsfac, ntype, nbfp, invsqrta, dvda,
+ *                gbtab, epsilon_r and gb_epsilon_solvent are used
+ *   mdatoms      supplies chargeA and typeA
+ *   kernel_data  supplies table_vdw and the energygrp_elec, energygrp_polarization
+ *                and energygrp_vdw accumulators
+ *   nrnb         flop accounting, updated once at the end through inc_nrnb()
+ */
+void
+nb_kernel_ElecGB_VdwCSTab_GeomP1P1_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     * Several locals declared below (e.g. tx, rcutoff, crf, rinvsix, Heps, dummy_mask, ewconv) are
+     * never referenced in this kernel; presumably they are common generator boilerplate.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    _fjsp_v2r8       vgb,fgb,vgbsum,dvdasum,gbscale,gbtabscale,isaprod,gbqqfactor,gbinvepsdiff,dvdaj,gbeps,twogbeps,dvdatmp;
+    _fjsp_v2r8       minushalf = gmx_fjsp_set1_v2r8(-0.5);
+    real             *invsqrta,*dvda,*gbtab;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+    real             *vftab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv; /* table indices are stored through .simd and read back as integers through .i[] */
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    vftab            = kernel_data->table_vdw->data;
+    vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_vdw->scale);
+
+    invsqrta         = fr->invsqrta;
+    dvda             = fr->dvda;
+    gbtabscale       = gmx_fjsp_set1_v2r8(fr->gbtab.scale);
+    gbtab            = fr->gbtab.data;
+    gbinvepsdiff     = gmx_fjsp_set1_v2r8((1.0/fr->epsilon_r) - (1.0/fr->gb_epsilon_solvent));
+
+    /* Give the j indices and offsets defined values up front to avoid
+     * compiler warnings about possibly uninitialised variables.
+     */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Offset of the shift vector for this list (DIM reals per shift index) */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors: entries jindex[iidx] .. jindex[iidx+1]-1 of jjnr */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_1rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+
+        /* Load parameters for i particles; the i charge is pre-multiplied by fr->epsfac (facel) */
+        iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_load1_v2r8(charge+inr+0));
+        isai0            = gmx_fjsp_load1_v2r8(invsqrta+inr+0);
+        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0]; /* row start in vdwparam: two reals (c6,c12) per type pair */
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+        vgbsum           = _fjsp_setzero_v2r8();
+        vvdwsum          = _fjsp_setzero_v2r8();
+        dvdasum          = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop: two j particles (A and B) per iteration */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+            isaj0            = gmx_fjsp_load_2real_swizzle_v2r8(invsqrta+jnrA+0,invsqrta+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 8; /* 8 reals per table point: offsets 0,2 (dispersion) and 4,6 (repulsion) are loaded below */
+            vfconv.i[1]     *= 8;
+
+            /* GENERALIZED BORN AND COULOMB ELECTROSTATICS */
+            isaprod          = _fjsp_mul_v2r8(isai0,isaj0);
+            gbqqfactor       = _fjsp_neg_v2r8(_fjsp_mul_v2r8(qq00,_fjsp_mul_v2r8(isaprod,gbinvepsdiff)));
+            gbscale          = _fjsp_mul_v2r8(isaprod,gbtabscale);
+
+            /* Calculate generalized born table index - this is a separate table from the normal one,
+             * but we use the same procedure by multiplying r with scale and truncating to integer.
+             * The GB table is indexed with a stride of 4 reals per point (offsets 0 and 2 are loaded).
+             */
+            rt               = _fjsp_mul_v2r8(r00,gbscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            gbeps            = _fjsp_sub_v2r8(rt,_fjsp_xtod_v2r8(itab_tmp));
+            _fjsp_store_v2r8(&gbconv.simd,itab_tmp);
+
+            Y                = _fjsp_load_v2r8( gbtab + 4*gbconv.i[0] ); /* table row for j particle A */
+            F                = _fjsp_load_v2r8( gbtab + 4*gbconv.i[1] ); /* table row for j particle B */
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F); /* presumably regroups the two rows so Y and F each hold one value per j particle */
+            G                = _fjsp_load_v2r8( gbtab + 4*gbconv.i[0] +2);
+            H                = _fjsp_load_v2r8( gbtab + 4*gbconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(gbeps,_fjsp_madd_v2r8(gbeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(gbeps,Fp,Y);
+            vgb              = _fjsp_mul_v2r8(gbqqfactor,VV);
+
+            twogbeps         = _fjsp_add_v2r8(gbeps,gbeps);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twogbeps,H,G),gbeps,Fp);
+            fgb              = _fjsp_mul_v2r8(gbqqfactor,_fjsp_mul_v2r8(FF,gbscale));
+            dvdatmp          = _fjsp_mul_v2r8(minushalf,_fjsp_madd_v2r8(fgb,r00,vgb));
+            dvdasum          = _fjsp_add_v2r8(dvdasum,dvdatmp); /* i-side share; scaled by isai0*isai0 after the j loop */
+            gmx_fjsp_increment_2real_swizzle_v2r8(dvda+jnrA,dvda+jnrB,_fjsp_mul_v2r8(dvdatmp,_fjsp_mul_v2r8(isaj0,isaj0))); /* j-side share, scaled by isaj0*isaj0 */
+            velec            = _fjsp_mul_v2r8(qq00,rinv00);
+            felec            = _fjsp_mul_v2r8(_fjsp_msub_v2r8(velec,rinv00,fgb),rinv00);
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 2 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 4 );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 6 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            vvdw             = _fjsp_add_v2r8(vvdw12,vvdw6);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+            vgbsum           = _fjsp_add_v2r8(vgbsum,vgb);
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fscal,dx00,dy00,dz00);
+
+            /* Inner loop uses 95 flops */
+        }
+
+        if(jidx<j_index_end) /* one j particle is left over when the neighbour count is odd */
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            /* Load parameters for the single j particle on top of a zeroed register */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+            isaj0            = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),invsqrta+jnrA+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 8;
+            vfconv.i[1]     *= 8; /* scaled for symmetry with the paired loop; only i[0] is used for loads in this block */
+
+            /* GENERALIZED BORN AND COULOMB ELECTROSTATICS */
+            isaprod          = _fjsp_mul_v2r8(isai0,isaj0);
+            gbqqfactor       = _fjsp_neg_v2r8(_fjsp_mul_v2r8(qq00,_fjsp_mul_v2r8(isaprod,gbinvepsdiff)));
+            gbscale          = _fjsp_mul_v2r8(isaprod,gbtabscale);
+
+            /* Calculate generalized born table index - this is a separate table from the normal one,
+             * but we use the same procedure by multiplying r with scale and truncating to integer.
+             * Only gbconv.i[0] is used below; the second table row is replaced by zeros.
+             */
+            rt               = _fjsp_mul_v2r8(r00,gbscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            gbeps            = _fjsp_sub_v2r8(rt,_fjsp_xtod_v2r8(itab_tmp));
+            _fjsp_store_v2r8(&gbconv.simd,itab_tmp);
+
+            Y                = _fjsp_load_v2r8( gbtab + 4*gbconv.i[0] );
+            F                = _fjsp_setzero_v2r8(); /* no j particle B: zero row instead of a second table load */
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( gbtab + 4*gbconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(gbeps,_fjsp_madd_v2r8(gbeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(gbeps,Fp,Y);
+            vgb              = _fjsp_mul_v2r8(gbqqfactor,VV);
+
+            twogbeps         = _fjsp_add_v2r8(gbeps,gbeps);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twogbeps,H,G),gbeps,Fp);
+            fgb              = _fjsp_mul_v2r8(gbqqfactor,_fjsp_mul_v2r8(FF,gbscale));
+            dvdatmp          = _fjsp_mul_v2r8(minushalf,_fjsp_madd_v2r8(fgb,r00,vgb));
+            dvdasum          = _fjsp_add_v2r8(dvdasum,dvdatmp); /* NOTE(review): added without the unpacklo masking applied to velec/vgb/vvdw below - confirm the upper element is always zero here */
+            gmx_fjsp_increment_1real_v2r8(dvda+jnrA,_fjsp_mul_v2r8(dvdatmp,_fjsp_mul_v2r8(isaj0,isaj0)));
+            velec            = _fjsp_mul_v2r8(qq00,rinv00);
+            felec            = _fjsp_mul_v2r8(_fjsp_msub_v2r8(velec,rinv00,fgb),rinv00);
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            vvdw             = _fjsp_add_v2r8(vvdw12,vvdw6);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom.
+             * Each term is first combined with a zero register through unpacklo, presumably
+             * to clear the unused upper element before it is accumulated.
+             */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+            vgb              = _fjsp_unpacklo_v2r8(vgb,_fjsp_setzero_v2r8());
+            vgbsum           = _fjsp_add_v2r8(vgbsum,vgb);
+            vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fscal,dx00,dy00,dz00);
+
+            /* Inner loop uses 95 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_1atom_swizzle_v2r8(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies: one accumulator slot per energy group index ggid */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_fjsp_update_1pot_v2r8(vgbsum,kernel_data->energygrp_polarization+ggid);
+        gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+        dvdasum = _fjsp_mul_v2r8(dvdasum, _fjsp_mul_v2r8(isai0,isai0));
+        gmx_fjsp_update_1pot_v2r8(dvdasum,dvda+inr); /* the potential-update helper is reused to accumulate into dvda[inr] */
+
+        /* Increment number of inner iterations: counted per j particle, not per SIMD pair */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 10 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops, using the per-iteration counts quoted in the loop comments above */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_VF,outeriter*10 + inneriter*95);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecGB_VdwCSTab_GeomP1P1_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: GeneralizedBorn
+ * VdW interaction:            CubicSplineTable
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecGB_VdwCSTab_GeomP1P1_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    _fjsp_v2r8       vgb,fgb,vgbsum,dvdasum,gbscale,gbtabscale,isaprod,gbqqfactor,gbinvepsdiff,dvdaj,gbeps,twogbeps,dvdatmp;
+    _fjsp_v2r8       minushalf = gmx_fjsp_set1_v2r8(-0.5);
+    real             *invsqrta,*dvda,*gbtab;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+    real             *vftab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    vftab            = kernel_data->table_vdw->data;
+    vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_vdw->scale);
+
+    invsqrta         = fr->invsqrta;
+    dvda             = fr->dvda;
+    gbtabscale       = gmx_fjsp_set1_v2r8(fr->gbtab.scale);
+    gbtab            = fr->gbtab.data;
+    gbinvepsdiff     = gmx_fjsp_set1_v2r8((1.0/fr->epsilon_r) - (1.0/fr->gb_epsilon_solvent));
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_1rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+
+        /* Load parameters for i particles */
+        iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_load1_v2r8(charge+inr+0));
+        isai0            = gmx_fjsp_load1_v2r8(invsqrta+inr+0);
+        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+        dvdasum          = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+            isaj0            = gmx_fjsp_load_2real_swizzle_v2r8(invsqrta+jnrA+0,invsqrta+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 8;
+            vfconv.i[1]     *= 8;
+
+            /* GENERALIZED BORN AND COULOMB ELECTROSTATICS */
+            isaprod          = _fjsp_mul_v2r8(isai0,isaj0);
+            gbqqfactor       = _fjsp_neg_v2r8(_fjsp_mul_v2r8(qq00,_fjsp_mul_v2r8(isaprod,gbinvepsdiff)));
+            gbscale          = _fjsp_mul_v2r8(isaprod,gbtabscale);
+
+            /* Calculate generalized born table index - this is a separate table from the normal one,
+             * but we use the same procedure by multiplying r with scale and truncating to integer.
+             */
+            rt               = _fjsp_mul_v2r8(r00,gbscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            gbeps            = _fjsp_sub_v2r8(rt,_fjsp_xtod_v2r8(itab_tmp));
+            _fjsp_store_v2r8(&gbconv.simd,itab_tmp);
+
+            Y                = _fjsp_load_v2r8( gbtab + 4*gbconv.i[0] );
+            F                = _fjsp_load_v2r8( gbtab + 4*gbconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( gbtab + 4*gbconv.i[0] +2);
+            H                = _fjsp_load_v2r8( gbtab + 4*gbconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(gbeps,_fjsp_madd_v2r8(gbeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(gbeps,Fp,Y);
+            vgb              = _fjsp_mul_v2r8(gbqqfactor,VV);
+
+            twogbeps         = _fjsp_add_v2r8(gbeps,gbeps);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twogbeps,H,G),gbeps,Fp);
+            fgb              = _fjsp_mul_v2r8(gbqqfactor,_fjsp_mul_v2r8(FF,gbscale));
+            dvdatmp          = _fjsp_mul_v2r8(minushalf,_fjsp_madd_v2r8(fgb,r00,vgb));
+            dvdasum          = _fjsp_add_v2r8(dvdasum,dvdatmp);
+            gmx_fjsp_increment_2real_swizzle_v2r8(dvda+jnrA,dvda+jnrB,_fjsp_mul_v2r8(dvdatmp,_fjsp_mul_v2r8(isaj0,isaj0)));
+            velec            = _fjsp_mul_v2r8(qq00,rinv00);
+            felec            = _fjsp_mul_v2r8(_fjsp_msub_v2r8(velec,rinv00,fgb),rinv00);
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 2 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 4 );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 6 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fscal,dx00,dy00,dz00);
+
+            /* Inner loop uses 85 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            /* Load parameters for j particles */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+            isaj0            = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),invsqrta+jnrA+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 8;
+            vfconv.i[1]     *= 8;
+
+            /* GENERALIZED BORN AND COULOMB ELECTROSTATICS */
+            isaprod          = _fjsp_mul_v2r8(isai0,isaj0);
+            gbqqfactor       = _fjsp_neg_v2r8(_fjsp_mul_v2r8(qq00,_fjsp_mul_v2r8(isaprod,gbinvepsdiff)));
+            gbscale          = _fjsp_mul_v2r8(isaprod,gbtabscale);
+
+            /* Calculate generalized born table index - this is a separate table from the normal one,
+             * but we use the same procedure by multiplying r with scale and truncating to integer.
+             */
+            rt               = _fjsp_mul_v2r8(r00,gbscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            gbeps            = _fjsp_sub_v2r8(rt,_fjsp_xtod_v2r8(itab_tmp));
+            _fjsp_store_v2r8(&gbconv.simd,itab_tmp);
+
+            Y                = _fjsp_load_v2r8( gbtab + 4*gbconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( gbtab + 4*gbconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(gbeps,_fjsp_madd_v2r8(gbeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(gbeps,Fp,Y);
+            vgb              = _fjsp_mul_v2r8(gbqqfactor,VV);
+
+            twogbeps         = _fjsp_add_v2r8(gbeps,gbeps);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twogbeps,H,G),gbeps,Fp);
+            fgb              = _fjsp_mul_v2r8(gbqqfactor,_fjsp_mul_v2r8(FF,gbscale));
+            dvdatmp          = _fjsp_mul_v2r8(minushalf,_fjsp_madd_v2r8(fgb,r00,vgb));
+            dvdasum          = _fjsp_add_v2r8(dvdasum,dvdatmp);
+            gmx_fjsp_increment_1real_v2r8(dvda+jnrA,_fjsp_mul_v2r8(dvdatmp,_fjsp_mul_v2r8(isaj0,isaj0)));
+            velec            = _fjsp_mul_v2r8(qq00,rinv00);
+            felec            = _fjsp_mul_v2r8(_fjsp_msub_v2r8(velec,rinv00,fgb),rinv00);
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fscal,dx00,dy00,dz00);
+
+            /* Inner loop uses 85 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_1atom_swizzle_v2r8(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        dvdasum = _fjsp_mul_v2r8(dvdasum, _fjsp_mul_v2r8(isai0,isai0));
+        gmx_fjsp_update_1pot_v2r8(dvdasum,dvda+inr);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 7 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_F,outeriter*7 + inneriter*85);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecGB_VdwLJ_GeomP1P1_sparc64_hpc_ace_double.c b/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecGB_VdwLJ_GeomP1P1_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..0a58534
--- /dev/null
@@ -0,0 +1,706 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecGB_VdwLJ_GeomP1P1_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: GeneralizedBorn
+ * VdW interaction:            LennardJones
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        PotentialAndForce
+ *
+ * For every i particle in the neighbor list, the j neighbors are processed
+ * two at a time (suffixes A and B), followed by a single-neighbor remainder
+ * step when one neighbor is left over.
+ *
+ * nlist       - neighbor list; nri, iinr, jindex, jjnr, shift and gid are read.
+ * xx          - coordinates, addressed here as a flat real array (x = xx[0])
+ *               with DIM reals per atom.
+ * ff          - forces, addressed the same way (f = ff[0]); handed to the
+ *               helpers that update the i and j particle forces.
+ * fr          - supplies shift_vec, fshift, epsfac, ntype, nbfp, invsqrta,
+ *               dvda, gbtab, epsilon_r and gb_epsilon_solvent.
+ * mdatoms     - supplies chargeA and typeA.
+ * kernel_data - energygrp_elec, energygrp_polarization and energygrp_vdw are
+ *               updated at the energy group index gid[iidx].
+ * nrnb        - flop accounting, updated once at the end through inc_nrnb().
+ */
+void
+nb_kernel_ElecGB_VdwLJ_GeomP1P1_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     *
+     * NOTE(review): this is generated code; several of the locals declared below
+     * (e.g. tx, ty, tz, rcutoff, vftab, vfconv, ewconv) are never referenced in
+     * this particular kernel.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    _fjsp_v2r8       vgb,fgb,vgbsum,dvdasum,gbscale,gbtabscale,isaprod,gbqqfactor,gbinvepsdiff,dvdaj,gbeps,twogbeps,dvdatmp;
+    _fjsp_v2r8       minushalf = gmx_fjsp_set1_v2r8(-0.5);
+    real             *invsqrta,*dvda,*gbtab;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+    real             *vftab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv; /* SIMD value stored via .simd, read back as two integer table indices via .i[]; only gbconv is used here */
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    invsqrta         = fr->invsqrta;
+    dvda             = fr->dvda;
+    gbtabscale       = gmx_fjsp_set1_v2r8(fr->gbtab.scale);
+    gbtab            = fr->gbtab.data;
+    gbinvepsdiff     = gmx_fjsp_set1_v2r8((1.0/fr->epsilon_r) - (1.0/fr->gb_epsilon_solvent));
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_1rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+
+        /* Load parameters for i particles.
+         * The i charge is pre-multiplied by facel (fr->epsfac), and vdwioffset0
+         * selects the row of the vdwparam table for this i atom type
+         * (2 reals per type pair, nvdwtype pairs per row).
+         */
+        iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_load1_v2r8(charge+inr+0));
+        isai0            = gmx_fjsp_load1_v2r8(invsqrta+inr+0);
+        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+        vgbsum           = _fjsp_setzero_v2r8();
+        vvdwsum          = _fjsp_setzero_v2r8();
+        dvdasum          = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop.
+         * Two j particles (A and B) are handled per iteration; the bound
+         * j_index_end-1 leaves a possible last unpaired neighbor for the
+         * remainder block after the loop.
+         */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+            isaj0            = gmx_fjsp_load_2real_swizzle_v2r8(invsqrta+jnrA+0,invsqrta+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            /* GENERALIZED BORN AND COULOMB ELECTROSTATICS */
+            isaprod          = _fjsp_mul_v2r8(isai0,isaj0);
+            gbqqfactor       = _fjsp_neg_v2r8(_fjsp_mul_v2r8(qq00,_fjsp_mul_v2r8(isaprod,gbinvepsdiff)));
+            gbscale          = _fjsp_mul_v2r8(isaprod,gbtabscale);
+
+            /* Calculate generalized born table index - this is a separate table from the normal one,
+             * but we use the same procedure by multiplying r with scale and truncating to integer.
+             *
+             * The integer indices are stored through gbconv.simd and read back from
+             * gbconv.i[0] / gbconv.i[1]. Each table point occupies 4 reals (index*4),
+             * fetched as two pairs at offsets +0 (Y,F) and +2 (G,H).
+             * NOTE(review): assuming _fjsp_madd_v2r8(a,b,c) is a*b+c, VV below is
+             * Y + eps*(F + eps*(G + eps*H)) and FF its derivative with respect to
+             * eps - confirm against the intrinsic definition.
+             */
+            rt               = _fjsp_mul_v2r8(r00,gbscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            gbeps            = _fjsp_sub_v2r8(rt,_fjsp_xtod_v2r8(itab_tmp));
+            _fjsp_store_v2r8(&gbconv.simd,itab_tmp);
+
+            Y                = _fjsp_load_v2r8( gbtab + 4*gbconv.i[0] );
+            F                = _fjsp_load_v2r8( gbtab + 4*gbconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( gbtab + 4*gbconv.i[0] +2);
+            H                = _fjsp_load_v2r8( gbtab + 4*gbconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(gbeps,_fjsp_madd_v2r8(gbeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(gbeps,Fp,Y);
+            vgb              = _fjsp_mul_v2r8(gbqqfactor,VV);
+
+            twogbeps         = _fjsp_add_v2r8(gbeps,gbeps);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twogbeps,H,G),gbeps,Fp);
+            fgb              = _fjsp_mul_v2r8(gbqqfactor,_fjsp_mul_v2r8(FF,gbscale));
+            dvdatmp          = _fjsp_mul_v2r8(minushalf,_fjsp_madd_v2r8(fgb,r00,vgb));
+            dvdasum          = _fjsp_add_v2r8(dvdasum,dvdatmp);
+            gmx_fjsp_increment_2real_swizzle_v2r8(dvda+jnrA,dvda+jnrB,_fjsp_mul_v2r8(dvdatmp,_fjsp_mul_v2r8(isaj0,isaj0))); /* j-side dvda contribution: dvdatmp scaled by isaj0*isaj0 */
+            velec            = _fjsp_mul_v2r8(qq00,rinv00);
+            felec            = _fjsp_mul_v2r8(_fjsp_msub_v2r8(velec,rinv00,fgb),rinv00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+            vgbsum           = _fjsp_add_v2r8(vgbsum,vgb);
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fscal,dx00,dy00,dz00);
+
+            /* Inner loop uses 74 flops */
+        }
+
+        if(jidx<j_index_end) /* remainder: one unpaired j particle is left after the two-at-a-time loop */
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+
+            /* Load parameters for j particles.
+             * NOTE(review): presumably _fjsp_loadl_v2r8 fills only one element from
+             * memory and takes the other from the zero vector passed as first
+             * argument - confirm against the intrinsic definition.
+             */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+            isaj0            = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),invsqrta+jnrA+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            /* GENERALIZED BORN AND COULOMB ELECTROSTATICS */
+            isaprod          = _fjsp_mul_v2r8(isai0,isaj0);
+            gbqqfactor       = _fjsp_neg_v2r8(_fjsp_mul_v2r8(qq00,_fjsp_mul_v2r8(isaprod,gbinvepsdiff)));
+            gbscale          = _fjsp_mul_v2r8(isaprod,gbtabscale);
+
+            /* Calculate generalized born table index - this is a separate table from the normal one,
+             * but we use the same procedure by multiplying r with scale and truncating to integer.
+             *
+             * Only gbconv.i[0] is used for table loads in this remainder step; the
+             * second operand of each transpose is a zero vector instead of a second
+             * table load.
+             */
+            rt               = _fjsp_mul_v2r8(r00,gbscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            gbeps            = _fjsp_sub_v2r8(rt,_fjsp_xtod_v2r8(itab_tmp));
+            _fjsp_store_v2r8(&gbconv.simd,itab_tmp);
+
+            Y                = _fjsp_load_v2r8( gbtab + 4*gbconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( gbtab + 4*gbconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(gbeps,_fjsp_madd_v2r8(gbeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(gbeps,Fp,Y);
+            vgb              = _fjsp_mul_v2r8(gbqqfactor,VV);
+
+            twogbeps         = _fjsp_add_v2r8(gbeps,gbeps);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twogbeps,H,G),gbeps,Fp);
+            fgb              = _fjsp_mul_v2r8(gbqqfactor,_fjsp_mul_v2r8(FF,gbscale));
+            dvdatmp          = _fjsp_mul_v2r8(minushalf,_fjsp_madd_v2r8(fgb,r00,vgb));
+            dvdasum          = _fjsp_add_v2r8(dvdasum,dvdatmp);
+            gmx_fjsp_increment_1real_v2r8(dvda+jnrA,_fjsp_mul_v2r8(dvdatmp,_fjsp_mul_v2r8(isaj0,isaj0)));
+            velec            = _fjsp_mul_v2r8(qq00,rinv00);
+            felec            = _fjsp_mul_v2r8(_fjsp_msub_v2r8(velec,rinv00,fgb),rinv00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom.
+             * Unlike the paired loop, each term is first combined with a zero vector
+             * through _fjsp_unpacklo_v2r8 - presumably to clear the SIMD element that
+             * has no j particle in this step (TODO confirm lane semantics).
+             */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+            vgb              = _fjsp_unpacklo_v2r8(vgb,_fjsp_setzero_v2r8());
+            vgbsum           = _fjsp_add_v2r8(vgbsum,vgb);
+            vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fscal,dx00,dy00,dz00);
+
+            /* Inner loop uses 74 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_1atom_swizzle_v2r8(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_fjsp_update_1pot_v2r8(vgbsum,kernel_data->energygrp_polarization+ggid);
+        gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+        dvdasum = _fjsp_mul_v2r8(dvdasum, _fjsp_mul_v2r8(isai0,isai0)); /* i-side dvda contribution: summed dvdatmp scaled by isai0*isai0 */
+        gmx_fjsp_update_1pot_v2r8(dvdasum,dvda+inr);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 10 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_VF,outeriter*10 + inneriter*74);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecGB_VdwLJ_GeomP1P1_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: GeneralizedBorn
+ * VdW interaction:            LennardJones
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecGB_VdwLJ_GeomP1P1_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    _fjsp_v2r8       vgb,fgb,vgbsum,dvdasum,gbscale,gbtabscale,isaprod,gbqqfactor,gbinvepsdiff,dvdaj,gbeps,twogbeps,dvdatmp;
+    _fjsp_v2r8       minushalf = gmx_fjsp_set1_v2r8(-0.5);
+    real             *invsqrta,*dvda,*gbtab;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+    real             *vftab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    invsqrta         = fr->invsqrta;
+    dvda             = fr->dvda;
+    gbtabscale       = gmx_fjsp_set1_v2r8(fr->gbtab.scale);
+    gbtab            = fr->gbtab.data;
+    gbinvepsdiff     = gmx_fjsp_set1_v2r8((1.0/fr->epsilon_r) - (1.0/fr->gb_epsilon_solvent));
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_1rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+
+        /* Load parameters for i particles */
+        iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_load1_v2r8(charge+inr+0));
+        isai0            = gmx_fjsp_load1_v2r8(invsqrta+inr+0);
+        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+        dvdasum          = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+            isaj0            = gmx_fjsp_load_2real_swizzle_v2r8(invsqrta+jnrA+0,invsqrta+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            /* GENERALIZED BORN AND COULOMB ELECTROSTATICS */
+            isaprod          = _fjsp_mul_v2r8(isai0,isaj0);
+            gbqqfactor       = _fjsp_neg_v2r8(_fjsp_mul_v2r8(qq00,_fjsp_mul_v2r8(isaprod,gbinvepsdiff)));
+            gbscale          = _fjsp_mul_v2r8(isaprod,gbtabscale);
+
+            /* Calculate generalized born table index - this is a separate table from the normal one,
+             * but we use the same procedure by multiplying r with scale and truncating to integer.
+             */
+            rt               = _fjsp_mul_v2r8(r00,gbscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            gbeps            = _fjsp_sub_v2r8(rt,_fjsp_xtod_v2r8(itab_tmp));
+            _fjsp_store_v2r8(&gbconv.simd,itab_tmp);
+
+            Y                = _fjsp_load_v2r8( gbtab + 4*gbconv.i[0] );
+            F                = _fjsp_load_v2r8( gbtab + 4*gbconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( gbtab + 4*gbconv.i[0] +2);
+            H                = _fjsp_load_v2r8( gbtab + 4*gbconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(gbeps,_fjsp_madd_v2r8(gbeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(gbeps,Fp,Y);
+            vgb              = _fjsp_mul_v2r8(gbqqfactor,VV);
+
+            twogbeps         = _fjsp_add_v2r8(gbeps,gbeps);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twogbeps,H,G),gbeps,Fp);
+            fgb              = _fjsp_mul_v2r8(gbqqfactor,_fjsp_mul_v2r8(FF,gbscale));
+            dvdatmp          = _fjsp_mul_v2r8(minushalf,_fjsp_madd_v2r8(fgb,r00,vgb));
+            dvdasum          = _fjsp_add_v2r8(dvdasum,dvdatmp);
+            gmx_fjsp_increment_2real_swizzle_v2r8(dvda+jnrA,dvda+jnrB,_fjsp_mul_v2r8(dvdatmp,_fjsp_mul_v2r8(isaj0,isaj0)));
+            velec            = _fjsp_mul_v2r8(qq00,rinv00);
+            felec            = _fjsp_mul_v2r8(_fjsp_msub_v2r8(velec,rinv00,fgb),rinv00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,c6_00),_fjsp_mul_v2r8(rinvsix,rinvsq00));
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fscal,dx00,dy00,dz00);
+
+            /* Inner loop uses 67 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+            isaj0            = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),invsqrta+jnrA+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            /* GENERALIZED BORN AND COULOMB ELECTROSTATICS */
+            isaprod          = _fjsp_mul_v2r8(isai0,isaj0);
+            gbqqfactor       = _fjsp_neg_v2r8(_fjsp_mul_v2r8(qq00,_fjsp_mul_v2r8(isaprod,gbinvepsdiff)));
+            gbscale          = _fjsp_mul_v2r8(isaprod,gbtabscale);
+
+            /* Calculate generalized born table index - this is a separate table from the normal one,
+             * but we use the same procedure by multiplying r with scale and truncating to integer.
+             */
+            rt               = _fjsp_mul_v2r8(r00,gbscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            gbeps            = _fjsp_sub_v2r8(rt,_fjsp_xtod_v2r8(itab_tmp));
+            _fjsp_store_v2r8(&gbconv.simd,itab_tmp);
+
+            Y                = _fjsp_load_v2r8( gbtab + 4*gbconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( gbtab + 4*gbconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(gbeps,_fjsp_madd_v2r8(gbeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(gbeps,Fp,Y);
+            vgb              = _fjsp_mul_v2r8(gbqqfactor,VV);
+
+            twogbeps         = _fjsp_add_v2r8(gbeps,gbeps);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twogbeps,H,G),gbeps,Fp);
+            fgb              = _fjsp_mul_v2r8(gbqqfactor,_fjsp_mul_v2r8(FF,gbscale));
+            dvdatmp          = _fjsp_mul_v2r8(minushalf,_fjsp_madd_v2r8(fgb,r00,vgb));
+            dvdasum          = _fjsp_add_v2r8(dvdasum,dvdatmp);
+            gmx_fjsp_increment_1real_v2r8(dvda+jnrA,_fjsp_mul_v2r8(dvdatmp,_fjsp_mul_v2r8(isaj0,isaj0)));
+            velec            = _fjsp_mul_v2r8(qq00,rinv00);
+            felec            = _fjsp_mul_v2r8(_fjsp_msub_v2r8(velec,rinv00,fgb),rinv00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,c6_00),_fjsp_mul_v2r8(rinvsix,rinvsq00));
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fscal,dx00,dy00,dz00);
+
+            /* Inner loop uses 67 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_1atom_swizzle_v2r8(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        dvdasum = _fjsp_mul_v2r8(dvdasum, _fjsp_mul_v2r8(isai0,isai0));
+        gmx_fjsp_update_1pot_v2r8(dvdasum,dvda+inr);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 7 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_F,outeriter*7 + inneriter*67);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecGB_VdwNone_GeomP1P1_sparc64_hpc_ace_double.c b/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecGB_VdwNone_GeomP1P1_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..bc2317e
--- /dev/null
@@ -0,0 +1,635 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecGB_VdwNone_GeomP1P1_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: GeneralizedBorn
+ * VdW interaction:            None
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecGB_VdwNone_GeomP1P1_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,       /* neighbor list; nri, iinr, jindex, jjnr, shift and gid are read */
+                     rvec * gmx_restrict                    xx,          /* coordinates; accessed as a flat real array through xx[0] */
+                     rvec * gmx_restrict                    ff,          /* forces; ff[0] is handed to the force update helpers */
+                     t_forcerec * gmx_restrict              fr,          /* supplies shift_vec, fshift, epsfac, invsqrta, dvda, gbtab and the dielectric constants */
+                     t_mdatoms * gmx_restrict               mdatoms,     /* only chargeA is read here */
+                     nb_kernel_data_t * gmx_restrict        kernel_data, /* energygrp_elec and energygrp_polarization are passed to the potential update helper */
+                     t_nrnb * gmx_restrict                  nrnb)        /* flop counter, updated through inc_nrnb() at the end */
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     * Note: many locals declared below (e.g. the vdw*, rcutoff* and krf* variables) are never referenced in this kernel. */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    _fjsp_v2r8       vgb,fgb,vgbsum,dvdasum,gbscale,gbtabscale,isaprod,gbqqfactor,gbinvepsdiff,dvdaj,gbeps,twogbeps,dvdatmp;
+    _fjsp_v2r8       minushalf = gmx_fjsp_set1_v2r8(-0.5);
+    real             *invsqrta,*dvda,*gbtab;
+    _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+    real             *vftab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv; /* only gbconv is used: a SIMD value is stored to .simd and read back as two integers via .i[] */
+
+    x                = xx[0]; /* x and f are real pointers, indexed below with DIM*atom offsets */
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+
+    invsqrta         = fr->invsqrta;
+    dvda             = fr->dvda;
+    gbtabscale       = gmx_fjsp_set1_v2r8(fr->gbtab.scale);
+    gbtab            = fr->gbtab.data;
+    gbinvepsdiff     = gmx_fjsp_set1_v2r8((1.0/fr->epsilon_r) - (1.0/fr->gb_epsilon_solvent)); /* 1/epsilon_r - 1/gb_epsilon_solvent */
+
+    /* Initialize these to avoid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_1rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+
+        /* Load parameters for i particles */
+        iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_load1_v2r8(charge+inr+0)); /* i charge pre-multiplied by facel (fr->epsfac) */
+        isai0            = gmx_fjsp_load1_v2r8(invsqrta+inr+0);
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+        vgbsum           = _fjsp_setzero_v2r8();
+        dvdasum          = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop: two j neighbors (A,B) per iteration; a leftover one is handled after the loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+            isaj0            = gmx_fjsp_load_2real_swizzle_v2r8(invsqrta+jnrA+0,invsqrta+jnrB+0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00); /* r = rsq * rinv */
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+
+            /* GENERALIZED BORN AND COULOMB ELECTROSTATICS */
+            isaprod          = _fjsp_mul_v2r8(isai0,isaj0);
+            gbqqfactor       = _fjsp_neg_v2r8(_fjsp_mul_v2r8(qq00,_fjsp_mul_v2r8(isaprod,gbinvepsdiff)));
+            gbscale          = _fjsp_mul_v2r8(isaprod,gbtabscale);
+
+            /* Calculate generalized born table index - this is a separate table from the normal one,
+             * but we use the same procedure by multiplying r with scale and truncating to integer.
+             */
+            rt               = _fjsp_mul_v2r8(r00,gbscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            gbeps            = _fjsp_sub_v2r8(rt,_fjsp_xtod_v2r8(itab_tmp)); /* rt minus its integer part */
+            _fjsp_store_v2r8(&gbconv.simd,itab_tmp); /* expose the two table indices as gbconv.i[0] and gbconv.i[1] */
+            /* Table stride is 4 reals per point; presumably stored as Y,F,G,H and sorted per lane by the transposes - TODO confirm */
+            Y                = _fjsp_load_v2r8( gbtab + 4*gbconv.i[0] );
+            F                = _fjsp_load_v2r8( gbtab + 4*gbconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( gbtab + 4*gbconv.i[0] +2);
+            H                = _fjsp_load_v2r8( gbtab + 4*gbconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(gbeps,_fjsp_madd_v2r8(gbeps,H,G),F); /* presumably F + eps*(G + eps*H), assuming madd(a,b,c) = a*b+c */
+            VV               = _fjsp_madd_v2r8(gbeps,Fp,Y);
+            vgb              = _fjsp_mul_v2r8(gbqqfactor,VV);
+
+            twogbeps         = _fjsp_add_v2r8(gbeps,gbeps);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twogbeps,H,G),gbeps,Fp);
+            fgb              = _fjsp_mul_v2r8(gbqqfactor,_fjsp_mul_v2r8(FF,gbscale));
+            dvdatmp          = _fjsp_mul_v2r8(minushalf,_fjsp_madd_v2r8(fgb,r00,vgb));
+            dvdasum          = _fjsp_add_v2r8(dvdasum,dvdatmp);
+            gmx_fjsp_increment_2real_swizzle_v2r8(dvda+jnrA,dvda+jnrB,_fjsp_mul_v2r8(dvdatmp,_fjsp_mul_v2r8(isaj0,isaj0))); /* j contribution: dvdatmp scaled by isaj0*isaj0 */
+            velec            = _fjsp_mul_v2r8(qq00,rinv00);
+            felec            = _fjsp_mul_v2r8(_fjsp_msub_v2r8(velec,rinv00,fgb),rinv00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+            vgbsum           = _fjsp_add_v2r8(vgbsum,vgb);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fscal,dx00,dy00,dz00);
+
+            /* Inner loop uses 61 flops */
+        }
+
+        if(jidx<j_index_end) /* true only when exactly one j neighbor is left after the pair loop */
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            /* Load parameters for j particles */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0); /* presumably low lane = value, other lane zero - TODO confirm */
+            isaj0            = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),invsqrta+jnrA+0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+
+            /* GENERALIZED BORN AND COULOMB ELECTROSTATICS */
+            isaprod          = _fjsp_mul_v2r8(isai0,isaj0);
+            gbqqfactor       = _fjsp_neg_v2r8(_fjsp_mul_v2r8(qq00,_fjsp_mul_v2r8(isaprod,gbinvepsdiff)));
+            gbscale          = _fjsp_mul_v2r8(isaprod,gbtabscale);
+
+            /* Calculate generalized born table index - this is a separate table from the normal one,
+             * but we use the same procedure by multiplying r with scale and truncating to integer.
+             */
+            rt               = _fjsp_mul_v2r8(r00,gbscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            gbeps            = _fjsp_sub_v2r8(rt,_fjsp_xtod_v2r8(itab_tmp));
+            _fjsp_store_v2r8(&gbconv.simd,itab_tmp);
+
+            Y                = _fjsp_load_v2r8( gbtab + 4*gbconv.i[0] ); /* only index i[0] is used: there is no second j atom */
+            F                = _fjsp_setzero_v2r8(); /* zeros stand in for the second atom's table data */
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( gbtab + 4*gbconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(gbeps,_fjsp_madd_v2r8(gbeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(gbeps,Fp,Y);
+            vgb              = _fjsp_mul_v2r8(gbqqfactor,VV);
+
+            twogbeps         = _fjsp_add_v2r8(gbeps,gbeps);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twogbeps,H,G),gbeps,Fp);
+            fgb              = _fjsp_mul_v2r8(gbqqfactor,_fjsp_mul_v2r8(FF,gbscale));
+            dvdatmp          = _fjsp_mul_v2r8(minushalf,_fjsp_madd_v2r8(fgb,r00,vgb));
+            dvdasum          = _fjsp_add_v2r8(dvdasum,dvdatmp);
+            gmx_fjsp_increment_1real_v2r8(dvda+jnrA,_fjsp_mul_v2r8(dvdatmp,_fjsp_mul_v2r8(isaj0,isaj0)));
+            velec            = _fjsp_mul_v2r8(qq00,rinv00);
+            felec            = _fjsp_mul_v2r8(_fjsp_msub_v2r8(velec,rinv00,fgb),rinv00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8()); /* presumably clears the unused lane before summing - TODO confirm */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+            vgb              = _fjsp_unpacklo_v2r8(vgb,_fjsp_setzero_v2r8());
+            vgbsum           = _fjsp_add_v2r8(vgbsum,vgb);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8()); /* same lane masking applied to the force scalar */
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fscal,dx00,dy00,dz00);
+
+            /* Inner loop uses 61 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_1atom_swizzle_v2r8(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx]; /* index into the energygrp_* arrays for this i entry */
+        /* Update potential energies */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_fjsp_update_1pot_v2r8(vgbsum,kernel_data->energygrp_polarization+ggid);
+        dvdasum = _fjsp_mul_v2r8(dvdasum, _fjsp_mul_v2r8(isai0,isai0)); /* i contribution: summed dvdatmp scaled by isai0*isai0 */
+        gmx_fjsp_update_1pot_v2r8(dvdasum,dvda+inr);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 9 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VF,outeriter*9 + inneriter*61); /* 9 flops per i entry plus 61 per j neighbor */
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecGB_VdwNone_GeomP1P1_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: GeneralizedBorn
+ * VdW interaction:            None
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecGB_VdwNone_GeomP1P1_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    _fjsp_v2r8       vgb,fgb,vgbsum,dvdasum,gbscale,gbtabscale,isaprod,gbqqfactor,gbinvepsdiff,dvdaj,gbeps,twogbeps,dvdatmp;
+    _fjsp_v2r8       minushalf = gmx_fjsp_set1_v2r8(-0.5);
+    real             *invsqrta,*dvda,*gbtab;
+    _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+    real             *vftab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+
+    invsqrta         = fr->invsqrta;
+    dvda             = fr->dvda;
+    gbtabscale       = gmx_fjsp_set1_v2r8(fr->gbtab.scale);
+    gbtab            = fr->gbtab.data;
+    gbinvepsdiff     = gmx_fjsp_set1_v2r8((1.0/fr->epsilon_r) - (1.0/fr->gb_epsilon_solvent));
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_1rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+
+        /* Load parameters for i particles */
+        iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_load1_v2r8(charge+inr+0));
+        isai0            = gmx_fjsp_load1_v2r8(invsqrta+inr+0);
+
+        dvdasum          = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+            isaj0            = gmx_fjsp_load_2real_swizzle_v2r8(invsqrta+jnrA+0,invsqrta+jnrB+0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+
+            /* GENERALIZED BORN AND COULOMB ELECTROSTATICS */
+            isaprod          = _fjsp_mul_v2r8(isai0,isaj0);
+            gbqqfactor       = _fjsp_neg_v2r8(_fjsp_mul_v2r8(qq00,_fjsp_mul_v2r8(isaprod,gbinvepsdiff)));
+            gbscale          = _fjsp_mul_v2r8(isaprod,gbtabscale);
+
+            /* Calculate generalized born table index - this is a separate table from the normal one,
+             * but we use the same procedure by multiplying r with scale and truncating to integer.
+             */
+            rt               = _fjsp_mul_v2r8(r00,gbscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            gbeps            = _fjsp_sub_v2r8(rt,_fjsp_xtod_v2r8(itab_tmp));
+            _fjsp_store_v2r8(&gbconv.simd,itab_tmp);
+
+            Y                = _fjsp_load_v2r8( gbtab + 4*gbconv.i[0] );
+            F                = _fjsp_load_v2r8( gbtab + 4*gbconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( gbtab + 4*gbconv.i[0] +2);
+            H                = _fjsp_load_v2r8( gbtab + 4*gbconv.i[1] +2);
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(gbeps,_fjsp_madd_v2r8(gbeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(gbeps,Fp,Y);
+            vgb              = _fjsp_mul_v2r8(gbqqfactor,VV);
+
+            twogbeps         = _fjsp_add_v2r8(gbeps,gbeps);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twogbeps,H,G),gbeps,Fp);
+            fgb              = _fjsp_mul_v2r8(gbqqfactor,_fjsp_mul_v2r8(FF,gbscale));
+            dvdatmp          = _fjsp_mul_v2r8(minushalf,_fjsp_madd_v2r8(fgb,r00,vgb));
+            dvdasum          = _fjsp_add_v2r8(dvdasum,dvdatmp);
+            gmx_fjsp_increment_2real_swizzle_v2r8(dvda+jnrA,dvda+jnrB,_fjsp_mul_v2r8(dvdatmp,_fjsp_mul_v2r8(isaj0,isaj0)));
+            velec            = _fjsp_mul_v2r8(qq00,rinv00);
+            felec            = _fjsp_mul_v2r8(_fjsp_msub_v2r8(velec,rinv00,fgb),rinv00);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fscal,dx00,dy00,dz00);
+
+            /* Inner loop uses 59 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            /* Load parameters for j particles */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+            isaj0            = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),invsqrta+jnrA+0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+
+            /* GENERALIZED BORN AND COULOMB ELECTROSTATICS */
+            isaprod          = _fjsp_mul_v2r8(isai0,isaj0);
+            gbqqfactor       = _fjsp_neg_v2r8(_fjsp_mul_v2r8(qq00,_fjsp_mul_v2r8(isaprod,gbinvepsdiff)));
+            gbscale          = _fjsp_mul_v2r8(isaprod,gbtabscale);
+
+            /* Calculate generalized born table index - this is a separate table from the normal one,
+             * but we use the same procedure by multiplying r with scale and truncating to integer.
+             */
+            rt               = _fjsp_mul_v2r8(r00,gbscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            gbeps            = _fjsp_sub_v2r8(rt,_fjsp_xtod_v2r8(itab_tmp));
+            _fjsp_store_v2r8(&gbconv.simd,itab_tmp);
+
+            Y                = _fjsp_load_v2r8( gbtab + 4*gbconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( gbtab + 4*gbconv.i[0] +2);
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(gbeps,_fjsp_madd_v2r8(gbeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(gbeps,Fp,Y);
+            vgb              = _fjsp_mul_v2r8(gbqqfactor,VV);
+
+            twogbeps         = _fjsp_add_v2r8(gbeps,gbeps);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twogbeps,H,G),gbeps,Fp);
+            fgb              = _fjsp_mul_v2r8(gbqqfactor,_fjsp_mul_v2r8(FF,gbscale));
+            dvdatmp          = _fjsp_mul_v2r8(minushalf,_fjsp_madd_v2r8(fgb,r00,vgb));
+            dvdasum          = _fjsp_add_v2r8(dvdasum,dvdatmp);
+            gmx_fjsp_increment_1real_v2r8(dvda+jnrA,_fjsp_mul_v2r8(dvdatmp,_fjsp_mul_v2r8(isaj0,isaj0)));
+            velec            = _fjsp_mul_v2r8(qq00,rinv00);
+            felec            = _fjsp_mul_v2r8(_fjsp_msub_v2r8(velec,rinv00,fgb),rinv00);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fscal,dx00,dy00,dz00);
+
+            /* Inner loop uses 59 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_1atom_swizzle_v2r8(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        dvdasum = _fjsp_mul_v2r8(dvdasum, _fjsp_mul_v2r8(isai0,isai0));
+        gmx_fjsp_update_1pot_v2r8(dvdasum,dvda+inr);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 7 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_F,outeriter*7 + inneriter*59);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecNone_VdwCSTab_GeomP1P1_sparc64_hpc_ace_double.c b/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecNone_VdwCSTab_GeomP1P1_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..9a0b84a
--- /dev/null
@@ -0,0 +1,632 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecNone_VdwCSTab_GeomP1P1_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: None
+ * VdW interaction:            CubicSplineTable (read from kernel_data->table_vdw)
+ * Geometry:                   Particle-Particle (one i atom against a list of single j atoms)
+ * Calculate force/pot:        PotentialAndForce (forces into ff and fr->fshift, VdW energy into kernel_data->energygrp_vdw)
+ */
+void
+nb_kernel_ElecNone_VdwCSTab_GeomP1P1_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters (this kernel only uses suffix 0).
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. the two different
+     * jnr indices whose data is placed in the two elements of a _fjsp_v2r8 SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+    real             *vftab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+    /* NOTE(review): several locals above (e.g. rcutoff*, iq0, one_sixth, gbconv, ewconv) are never used in this kernel; they appear to be generator boilerplate */
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    vftab            = kernel_data->table_vdw->data;
+    vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_vdw->scale);
+
+    /* Pre-set the j indices/offsets: they are otherwise only assigned inside the j loops, so this avoids uninitialized-use compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists: one i atom (iinr[iidx]) and its j range jindex[iidx]..jindex[iidx+1] per iteration */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_1rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+
+        /* Load parameters for i particles: offset of the i atom type's row in vdwparam (2 reals, c6 and c12, per j type) */
+        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+        /* Reset potential sums */
+        vvdwsum          = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop: two j entries (A,B) per iteration; an odd trailing entry is left for the block after the loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            /* Load parameters for j particles: column offsets (2 reals per type) within the i atom's vdwparam row */
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+            /* Each table point spans 8 reals: offsets +0..+3 are read for dispersion and +4..+7 for repulsion below */
+            vfconv.i[0]     *= 8;
+            vfconv.i[1]     *= 8;
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 2 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 4 );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 6 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            vvdw             = _fjsp_add_v2r8(vvdw12,vvdw6);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            /* Update vectorial force: accumulate d*fscal on the i atom, then apply the same product to the two j atoms via the decrement helper */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fscal,dx00,dy00,dz00);
+
+            /* Inner loop uses 59 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+            /* Odd number of j entries: process the last one alone; the second SIMD element carries no real j atom in this block */
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            /* Load parameters for j particles */
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 8;
+            vfconv.i[1]     *= 8;
+
+            /* CUBIC SPLINE TABLE DISPERSION (only vfconv.i[0] is used for table loads; the second operand of each transpose is zero) */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            vvdw             = _fjsp_add_v2r8(vvdw12,vvdw6);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom; the unpacklo with zero presumably clears the unused upper element first. */
+            vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+            /* NOTE(review): presumably keeps only the low element so the unused lane adds no force - confirm against _fjsp_unpacklo_v2r8 */
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fscal,dx00,dy00,dz00);
+
+            /* Inner loop uses 59 flops */
+        }
+
+        /* End of innermost loop: pass the accumulated i force to the helper together with this atom's force and shift-force locations */
+
+        gmx_fjsp_update_iforce_1atom_swizzle_v2r8(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies: vvdwsum goes to the energygrp_vdw entry selected by this list's gid */
+        gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 7 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops: 7 per i entry plus 59 per j entry, booked under eNR_NBKERNEL_VDW_VF */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_VDW_VF,outeriter*7 + inneriter*59);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecNone_VdwCSTab_GeomP1P1_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: None
+ * VdW interaction:            CubicSplineTable (read from kernel_data->table_vdw)
+ * Geometry:                   Particle-Particle (one i atom against a list of single j atoms)
+ * Calculate force/pot:        Force (forces into ff and fr->fshift; no energies are accumulated)
+ */
+void
+nb_kernel_ElecNone_VdwCSTab_GeomP1P1_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters (this kernel only uses suffix 0).
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. the two different
+     * jnr indices whose data is placed in the two elements of a _fjsp_v2r8 SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+    real             *vftab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+    /* NOTE(review): several locals above (e.g. rcutoff*, iq0, vvdwsum, VV, ggid, gbconv, ewconv) are never used in this force-only kernel; they appear to be generator boilerplate */
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    vftab            = kernel_data->table_vdw->data;
+    vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_vdw->scale);
+
+    /* Pre-set the j indices/offsets: they are otherwise only assigned inside the j loops, so this avoids uninitialized-use compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists: one i atom (iinr[iidx]) and its j range jindex[iidx]..jindex[iidx+1] per iteration */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_1rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+
+        /* Load parameters for i particles: offset of the i atom type's row in vdwparam (2 reals, c6 and c12, per j type) */
+        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+        /* Start inner kernel loop: two j entries (A,B) per iteration; an odd trailing entry is left for the block after the loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            /* Load parameters for j particles: column offsets (2 reals per type) within the i atom's vdwparam row */
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+            /* Each table point spans 8 reals: offsets +0..+3 are read for dispersion and +4..+7 for repulsion below */
+            vfconv.i[0]     *= 8;
+            vfconv.i[1]     *= 8;
+
+            /* CUBIC SPLINE TABLE DISPERSION (force only: Y is loaded for the transpose but no potential VV is formed) */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 2 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 4 );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 6 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            fscal            = fvdw;
+
+            /* Update vectorial force: accumulate d*fscal on the i atom, then apply the same product to the two j atoms via the decrement helper */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fscal,dx00,dy00,dz00);
+
+            /* Inner loop uses 51 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+            /* Odd number of j entries: process the last one alone; the second SIMD element carries no real j atom in this block */
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            /* Load parameters for j particles */
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 8;
+            vfconv.i[1]     *= 8;
+
+            /* CUBIC SPLINE TABLE DISPERSION (only vfconv.i[0] is used for table loads; the second operand of each transpose is zero) */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            fscal            = fvdw;
+            /* NOTE(review): presumably keeps only the low element so the unused lane adds no force - confirm against _fjsp_unpacklo_v2r8 */
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fscal,dx00,dy00,dz00);
+
+            /* Inner loop uses 51 flops */
+        }
+
+        /* End of innermost loop: pass the accumulated i force to the helper together with this atom's force and shift-force locations */
+
+        gmx_fjsp_update_iforce_1atom_swizzle_v2r8(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 6 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops: 6 per i entry plus 51 per j entry, booked under eNR_NBKERNEL_VDW_F */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_VDW_F,outeriter*6 + inneriter*51);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecNone_VdwLJSh_GeomP1P1_sparc64_hpc_ace_double.c b/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecNone_VdwLJSh_GeomP1P1_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..7bd48ad
--- /dev/null
@@ -0,0 +1,552 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecNone_VdwLJSh_GeomP1P1_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: None
+ * VdW interaction:            LennardJones (potential uses fr->ic->sh_invrc6; pairs are masked against fr->rvdw squared)
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        PotentialAndForce (forces via ff, VdW energy via kernel_data->energygrp_vdw)
+ */
+void
+nb_kernel_ElecNone_VdwLJSh_GeomP1P1_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters (this particle-particle kernel only uses suffix 0).
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. the two different
+     * jnr indices (jnrA, jnrB) whose data is loaded together by the *_2ptr_swizzle helpers.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    rcutoff_scalar   = fr->rvdw;
+    rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+    rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+
+    sh_vdw_invrcut6  = gmx_fjsp_set1_v2r8(fr->ic->sh_invrc6);
+    rvdw             = gmx_fjsp_set1_v2r8(fr->rvdw);
+
+    /* Pre-initialize the j indices and coordinate offsets to avoid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists (one i particle, iinr[iidx], per list entry) */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Offset of this list's shift vector; used to index both shiftvec and fshift */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors: entries jindex[iidx] .. jindex[iidx+1]-1 of jjnr */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer (i) particle index and its offset into x and f */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_1rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+
+        /* Load parameters for i particles: offset of the i atom type's row in vdwparam (2 reals per type pair) */
+        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+        /* Reset potential sums */
+        vvdwsum          = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop: two j neighbors (A,B) per pass; an odd leftover is handled after the loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* Load coordinates of both j atoms */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinvsq00         = gmx_fjsp_inv_v2r8(rsq00);
+
+            /* Load parameters for j particles: offsets of the j atom types within the i atom's vdwparam row */
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms: fetch c6 and c12 from vdwparam */
+            gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8(_fjsp_nmsub_v2r8(c12_00,_fjsp_mul_v2r8(sh_vdw_invrcut6,sh_vdw_invrcut6),vvdw12),one_twelfth,
+                                           _fjsp_mul_v2r8(_fjsp_nmsub_v2r8( c6_00,sh_vdw_invrcut6,vvdw6),one_sixth));
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom, after applying cutoff_mask to the pair energies. */
+            vvdw             = _fjsp_and_v2r8(vvdw,cutoff_mask);
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force: accumulate on the i atom here; the helper below updates f at both j offsets */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fscal,dx00,dy00,dz00);
+
+            }
+
+            /* Inner loop uses 44 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* Load coordinates of the single leftover j atom (odd-length list); only index A is used here */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinvsq00         = gmx_fjsp_inv_v2r8(rsq00);
+
+            /* Load parameters for j particles: offset of the j atom type within the i atom's vdwparam row */
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms: fetch c6 and c12 from vdwparam */
+            gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8(_fjsp_nmsub_v2r8(c12_00,_fjsp_mul_v2r8(sh_vdw_invrcut6,sh_vdw_invrcut6),vvdw12),one_twelfth,
+                                           _fjsp_mul_v2r8(_fjsp_nmsub_v2r8( c6_00,sh_vdw_invrcut6,vvdw6),one_sixth));
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom; the unpacklo with zero presumably clears the unused second element. */
+            vvdw             = _fjsp_and_v2r8(vvdw,cutoff_mask);
+            vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force: accumulate on the i atom here; the helper below updates f at the j offset */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fscal,dx00,dy00,dz00);
+
+            }
+
+            /* Inner loop uses 44 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_1atom_swizzle_v2r8(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies: pass vvdwsum on for energy group index ggid */
+        gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 7 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops: 7 per outer and 44 per inner iteration, booked under eNR_NBKERNEL_VDW_VF */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_VDW_VF,outeriter*7 + inneriter*44);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecNone_VdwLJSh_GeomP1P1_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: None
+ * VdW interaction:            LennardJones (pairs are masked against fr->rvdw squared)
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        Force (only forces are written; no energy sums are accumulated)
+ */
+void
+nb_kernel_ElecNone_VdwLJSh_GeomP1P1_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters (this particle-particle kernel only uses suffix 0).
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. the two different
+     * jnr indices (jnrA, jnrB) whose data is loaded together by the *_2ptr_swizzle helpers.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    rcutoff_scalar   = fr->rvdw;
+    rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+    rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+
+    sh_vdw_invrcut6  = gmx_fjsp_set1_v2r8(fr->ic->sh_invrc6);
+    rvdw             = gmx_fjsp_set1_v2r8(fr->rvdw);
+
+    /* Pre-initialize the j indices and coordinate offsets to avoid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists (one i particle, iinr[iidx], per list entry) */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Offset of this list's shift vector; used to index both shiftvec and fshift */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors: entries jindex[iidx] .. jindex[iidx+1]-1 of jjnr */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer (i) particle index and its offset into x and f */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_1rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+
+        /* Load parameters for i particles: offset of the i atom type's row in vdwparam (2 reals per type pair) */
+        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+        /* Start inner kernel loop: two j neighbors (A,B) per pass; an odd leftover is handled after the loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* Load coordinates of both j atoms */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinvsq00         = gmx_fjsp_inv_v2r8(rsq00);
+
+            /* Load parameters for j particles: offsets of the j atom types within the i atom's vdwparam row */
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms: fetch c6 and c12 from vdwparam */
+            gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,c6_00),_fjsp_mul_v2r8(rinvsix,rinvsq00));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            fscal            = fvdw;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force: accumulate on the i atom here; the helper below updates f at both j offsets */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fscal,dx00,dy00,dz00);
+
+            }
+
+            /* Inner loop uses 33 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* Load coordinates of the single leftover j atom (odd-length list); only index A is used here */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinvsq00         = gmx_fjsp_inv_v2r8(rsq00);
+
+            /* Load parameters for j particles: offset of the j atom type within the i atom's vdwparam row */
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms: fetch c6 and c12 from vdwparam */
+            gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,c6_00),_fjsp_mul_v2r8(rinvsix,rinvsq00));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            fscal            = fvdw;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force: accumulate on the i atom here; the helper below updates f at the j offset */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fscal,dx00,dy00,dz00);
+
+            }
+
+            /* Inner loop uses 33 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_1atom_swizzle_v2r8(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 6 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops: 6 per outer and 33 per inner iteration, booked under eNR_NBKERNEL_VDW_F */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_VDW_F,outeriter*6 + inneriter*33);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecNone_VdwLJSw_GeomP1P1_sparc64_hpc_ace_double.c b/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecNone_VdwLJSw_GeomP1P1_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..efe5597
--- /dev/null
@@ -0,0 +1,636 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecNone_VdwLJSw_GeomP1P1_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: None
+ * VdW interaction:            LennardJones
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecNone_VdwLJSw_GeomP1P1_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
+    real             rswitch_scalar,d_scalar;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    rcutoff_scalar   = fr->rvdw;
+    rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+    rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+
+    rswitch_scalar   = fr->rvdw_switch;
+    rswitch          = gmx_fjsp_set1_v2r8(rswitch_scalar);
+    /* Setup switch parameters */
+    d_scalar         = rcutoff_scalar-rswitch_scalar;
+    d                = gmx_fjsp_set1_v2r8(d_scalar);
+    swV3             = gmx_fjsp_set1_v2r8(-10.0/(d_scalar*d_scalar*d_scalar));
+    swV4             = gmx_fjsp_set1_v2r8( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+    swV5             = gmx_fjsp_set1_v2r8( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+    swF2             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar));
+    swF3             = gmx_fjsp_set1_v2r8( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+    swF4             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_1rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+
+        /* Load parameters for i particles */
+        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+        /* Reset potential sums */
+        vvdwsum          = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            d                = _fjsp_sub_v2r8(r00,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            fvdw             = _fjsp_msub_v2r8( fvdw,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(vvdw,dsw)) );
+            vvdw             = _fjsp_mul_v2r8(vvdw,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdw             = _fjsp_and_v2r8(vvdw,cutoff_mask);
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fscal,dx00,dy00,dz00);
+
+            }
+
+            /* Inner loop uses 62 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            d                = _fjsp_sub_v2r8(r00,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            fvdw             = _fjsp_msub_v2r8( fvdw,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(vvdw,dsw)) );
+            vvdw             = _fjsp_mul_v2r8(vvdw,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdw             = _fjsp_and_v2r8(vvdw,cutoff_mask);
+            vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fscal,dx00,dy00,dz00);
+
+            }
+
+            /* Inner loop uses 62 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_1atom_swizzle_v2r8(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 7 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_VDW_VF,outeriter*7 + inneriter*62);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecNone_VdwLJSw_GeomP1P1_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: None
+ * VdW interaction:            LennardJones, multiplied by a switch function between fr->rvdw_switch and the fr->rvdw cutoff
+ * Geometry:                   Particle-Particle (one i atom against j atoms taken two at a time, plus a single-j epilogue)
+ * Calculate force/pot:        Force only; x/f are accessed through xx[0]/ff[0] and the flop estimate is reported to nrnb
+ */
+void
+nb_kernel_ElecNone_VdwLJSw_GeomP1P1_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,        /* neighbor list: nri, iinr, jindex, jjnr, shift and gid are read */
+                     rvec * gmx_restrict                    xx,           /* coordinates; accessed as a flat real array through xx[0] */
+                     rvec * gmx_restrict                    ff,           /* forces; updated in place through ff[0] */
+                     t_forcerec * gmx_restrict              fr,           /* supplies shift_vec, fshift, ntype, nbfp, rvdw and rvdw_switch */
+                     t_mdatoms * gmx_restrict               mdatoms,      /* supplies typeA, indexed by atom number to get the VdW type */
+                     nb_kernel_data_t * gmx_restrict        kernel_data,  /* not referenced in this force-only kernel */
+                     t_nrnb * gmx_restrict                  nrnb)         /* receives the flop estimate via inc_nrnb() */
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop; this
+     * particle-particle kernel only uses suffix 0.
+     * Suffixes A,B refer to the j loop unrolling done with double precision SIMD: the two jnr indices (jnrA, jnrB)
+     * whose data are loaded together into one _fjsp_v2r8 register. Several locals below (e.g. tx, jidxall, two) are never referenced.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
+    real             rswitch_scalar,d_scalar;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid; /* read here but not used further in this force-only kernel */
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    rcutoff_scalar   = fr->rvdw;
+    rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+    rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+
+    rswitch_scalar   = fr->rvdw_switch;
+    rswitch          = gmx_fjsp_set1_v2r8(rswitch_scalar);
+    /* Setup switch parameters: polynomial coefficients scaled by powers of the switching-range width (rcutoff - rswitch) */
+    d_scalar         = rcutoff_scalar-rswitch_scalar;
+    d                = gmx_fjsp_set1_v2r8(d_scalar); /* this value is overwritten inside the loops below before d is read */
+    swV3             = gmx_fjsp_set1_v2r8(-10.0/(d_scalar*d_scalar*d_scalar));
+    swV4             = gmx_fjsp_set1_v2r8( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+    swV5             = gmx_fjsp_set1_v2r8( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+    swF2             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar)); /* swF2..swF4 equal 3*swV3, 4*swV4, 5*swV5: the derivative coefficients */
+    swF3             = gmx_fjsp_set1_v2r8( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+    swF4             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+
+    /* Pre-initialize the j indices and offsets to avoid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists: one i atom (iinr[iidx]) per list entry */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Offset of this list's shift vector; used to index both shiftvec and fshift */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors: entries jjnr[j_index_start] .. jjnr[j_index_end-1] */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_1rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+
+        /* Load parameters for i particles: offset of the i atom type's row in vdwparam (2 reals per j type, nvdwtype j types) */
+        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+        /* Start inner kernel loop: j atoms are processed in pairs (A,B); an odd leftover is handled after the loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2)) /* judging by the helper's name: skip the pair of j atoms when neither is inside the cutoff */
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms: fetch (c6,c12) for both i-j type combinations */
+            gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) ); /* potential is still needed here: it enters the v*dsw force term below */
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            d                = _fjsp_sub_v2r8(r00,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8()); /* clamp at zero so that sw=1 and dsw=0 when r <= rswitch */
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3)))); /* 1+d^3*(swV3+d*(swV4+d*swV5)), assuming madd(a,b,c)=a*b+c */
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2)); /* d^2*(swF2+d*(swF3+d*swF4)) under the same assumption */
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            fvdw             = _fjsp_msub_v2r8( fvdw,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(vvdw,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            fscal            = fvdw;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask); /* apply the rsq < rcutoff2 comparison result as a mask on fscal */
+
+            /* Update vectorial force: accumulate fscal*dx on the i atom; the j atoms' entries in f are updated by the helper below */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fscal,dx00,dy00,dz00);
+
+            }
+
+            /* Inner loop uses 59 flops */
+        }
+
+        if(jidx<j_index_end) /* odd number of neighbors: exactly one j atom remains */
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms: fetch (c6,c12) for the single i-j type combination */
+            gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            d                = _fjsp_sub_v2r8(r00,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            fvdw             = _fjsp_msub_v2r8( fvdw,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(vvdw,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            fscal            = fvdw;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8()); /* presumably keeps the A element and zeroes the unused second one - TODO confirm against the intrinsic's definition */
+
+            /* Update vectorial force: same accumulation as in the paired loop, with the 1-pointer variant of the j update */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fscal,dx00,dy00,dz00);
+
+            }
+
+            /* Inner loop uses 59 flops */
+        }
+
+        /* End of innermost loop: hand the accumulated i-atom force to the helper together with the f and fshift destinations */
+
+        gmx_fjsp_update_iforce_1atom_swizzle_v2r8(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations (counted per j atom, not per SIMD pair) */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 6 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops: report the estimate under eNR_NBKERNEL_VDW_F using the 6/59 flop counts noted above */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_VDW_F,outeriter*6 + inneriter*59);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecNone_VdwLJ_GeomP1P1_sparc64_hpc_ace_double.c b/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecNone_VdwLJ_GeomP1P1_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..8a22af6
--- /dev/null
@@ -0,0 +1,498 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecNone_VdwLJ_GeomP1P1_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: None
+ * VdW interaction:            LennardJones (plain; no cutoff test or switch function is applied inside this kernel)
+ * Geometry:                   Particle-Particle (one i atom against j atoms taken two at a time, plus a single-j epilogue)
+ * Calculate force/pot:        PotentialAndForce; each list's VdW energy sum is passed on with kernel_data->energygrp_vdw+gid[iidx]
+ */
+void
+nb_kernel_ElecNone_VdwLJ_GeomP1P1_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,        /* neighbor list: nri, iinr, jindex, jjnr, shift and gid are read */
+                     rvec * gmx_restrict                    xx,           /* coordinates; accessed as a flat real array through xx[0] */
+                     rvec * gmx_restrict                    ff,           /* forces; updated in place through ff[0] */
+                     t_forcerec * gmx_restrict              fr,           /* supplies shift_vec, fshift, ntype and nbfp */
+                     t_mdatoms * gmx_restrict               mdatoms,      /* supplies typeA, indexed by atom number to get the VdW type */
+                     nb_kernel_data_t * gmx_restrict        kernel_data,  /* energygrp_vdw is the destination for the VdW energy sums */
+                     t_nrnb * gmx_restrict                  nrnb)         /* receives the flop estimate via inc_nrnb() */
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop; this
+     * particle-particle kernel only uses suffix 0.
+     * Suffixes A,B refer to the j loop unrolling done with double precision SIMD: the two jnr indices (jnrA, jnrB)
+     * whose data are loaded together into one _fjsp_v2r8 register. Several locals below (e.g. tx, rcutoff, one, two) are never referenced.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid; /* per-list index used below to select the energygrp_vdw entry */
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    /* Pre-initialize the j indices and offsets to avoid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists: one i atom (iinr[iidx]) per list entry */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Offset of this list's shift vector; used to index both shiftvec and fshift */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors: entries jjnr[j_index_start] .. jjnr[j_index_end-1] */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_1rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+
+        /* Load parameters for i particles: offset of the i atom type's row in vdwparam (2 reals per j type, nvdwtype j types) */
+        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+        /* Reset potential sums (accumulated separately for every list entry) */
+        vvdwsum          = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop: j atoms are processed in pairs (A,B); an odd leftover is handled after the loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinvsq00         = gmx_fjsp_inv_v2r8(rsq00); /* 1/r^2 going by the helper's name; no inverse square root is computed in this kernel */
+
+            /* Load parameters for j particles */
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms: fetch (c6,c12) for both i-j type combinations */
+            gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) ); /* vvdw12/12 - vvdw6/6, assuming msub(a,b,c)=a*b-c */
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            /* Update vectorial force: accumulate fscal*dx on the i atom; the j atoms' entries in f are updated by the helper below */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fscal,dx00,dy00,dz00);
+
+            /* Inner loop uses 35 flops */
+        }
+
+        if(jidx<j_index_end) /* odd number of neighbors: exactly one j atom remains */
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinvsq00         = gmx_fjsp_inv_v2r8(rsq00);
+
+            /* Load parameters for j particles */
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms: fetch (c6,c12) for the single i-j type combination */
+            gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8()); /* presumably keeps the A element and zeroes the unused second one - TODO confirm against the intrinsic's definition */
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8()); /* same treatment for the force scalar */
+
+            /* Update vectorial force: same accumulation as in the paired loop, with the 1-pointer variant of the j update */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fscal,dx00,dy00,dz00);
+
+            /* Inner loop uses 35 flops */
+        }
+
+        /* End of innermost loop: hand the accumulated i-atom force to the helper together with the f and fshift destinations */
+
+        gmx_fjsp_update_iforce_1atom_swizzle_v2r8(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies: pass vvdwsum with the energygrp_vdw entry selected by ggid */
+        gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations (counted per j atom, not per SIMD pair) */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 7 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops: report the estimate under eNR_NBKERNEL_VDW_VF using the 7/35 flop counts noted above */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_VDW_VF,outeriter*7 + inneriter*35);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecNone_VdwLJ_GeomP1P1_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: None
+ * VdW interaction:            LennardJones
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        Force
+ *
+ * Force-only kernel: no potential energy sums are accumulated or written here.
+ *
+ * Arguments as used by this function:
+ *   nlist       - neighbor list; nri, iinr, jindex, jjnr, shift and gid are read
+ *                 (gid is only copied to a local and not used further).
+ *   xx          - coordinates, accessed as a flat real array starting at xx[0].
+ *   ff          - forces, accessed as a flat real array starting at ff[0] and
+ *                 handed to the gmx_fjsp_decrement_fma_* / gmx_fjsp_update_iforce_*
+ *                 helpers (presumably updated in place - confirm in the helper header).
+ *   fr          - supplies shift_vec, fshift, ntype and nbfp.
+ *   mdatoms     - supplies typeA.
+ *   kernel_data - not referenced in this kernel.
+ *   nrnb        - flop accounting, updated once at the end through inc_nrnb().
+ */
+void
+nb_kernel_ElecNone_VdwLJ_GeomP1P1_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. the two
+     * different jnr indices (A and B) that are processed together in one inner-loop iteration.
+     *
+     * NOTE(review): several locals below are declared but never referenced in this
+     * kernel (e.g. tx,ty,tz, jidxall, one_sixth, one_twelfth, vfconv, gbconv, ewconv);
+     * presumably they come from a declaration template shared with other kernels.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    /* Give the j indices/offsets defined values up front so the compiler does not
+     * warn about possibly uninitialized use */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists (one i particle per list entry) */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list (offset into the flat shiftvec/fshift arrays) */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors: j entries are jjnr[j_index_start..j_index_end-1] */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_1rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+
+        /* Load parameters for i particles: offset of this i type's row in vdwparam,
+         * to which the per-j offset 2*type is added below */
+        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+        /* Start inner kernel loop: two j entries (A,B) per iteration; a possible
+         * single leftover entry is handled after the loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinvsq00         = gmx_fjsp_inv_v2r8(rsq00);
+
+            /* Load parameters for j particles */
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION
+             * rinvsix = rinvsq00^3; the scalar force factor is
+             * msub(c12,rinvsix,c6) * rinvsix * rinvsq00, i.e. (c12*rinvsix - c6)*rinvsix*rinvsq00
+             * assuming _fjsp_msub_v2r8(a,b,c) evaluates a*b-c (TODO confirm against the
+             * intrinsic reference).
+             */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,c6_00),_fjsp_mul_v2r8(rinvsix,rinvsq00));
+
+            fscal            = fvdw;
+
+            /* Update vectorial force: accumulate displacement*fscal on the i particle;
+             * the j forces are handled by the helper call below */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fscal,dx00,dy00,dz00);
+
+            /* Inner loop uses 30 flops */
+        }
+
+        if(jidx<j_index_end) /* odd number of j entries: exactly one is left over */
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates (single j particle, 1ptr variant of the helper) */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinvsq00         = gmx_fjsp_inv_v2r8(rsq00);
+
+            /* Load parameters for j particles */
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION (same expressions as in the paired loop above) */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,c6_00),_fjsp_mul_v2r8(rinvsix,rinvsq00));
+
+            fscal            = fvdw;
+
+            /* NOTE(review): presumably keeps the element belonging to j particle A and
+             * zeroes the other one, so the unused SIMD element adds no force - TODO confirm */
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fscal,dx00,dy00,dz00);
+
+            /* Inner loop uses 30 flops */
+        }
+
+        /* End of innermost loop: hand the accumulated i force to the helper together
+         * with the i particle's force slot and the shift-force slot for this list */
+
+        gmx_fjsp_update_iforce_1atom_swizzle_v2r8(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations (counted per j entry, not per SIMD pair) */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 6 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops: 6 per outer iteration plus 30 per j entry */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_VDW_F,outeriter*6 + inneriter*30);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRFCut_VdwCSTab_GeomP1P1_sparc64_hpc_ace_double.c b/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRFCut_VdwCSTab_GeomP1P1_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..d1895e9
--- /dev/null
@@ -0,0 +1,733 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwCSTab_GeomP1P1_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: ReactionField
+ * VdW interaction:            CubicSplineTable
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        PotentialAndForce
+ *
+ * Evaluates both forces and potential energies. Contributions of pairs with
+ * rsq >= rcutoff^2 are masked out (rcutoff is taken from fr->rcoulomb).
+ *
+ * Arguments as used by this function:
+ *   nlist       - neighbor list; nri, iinr, jindex, jjnr, shift and gid are read.
+ *   xx          - coordinates, accessed as a flat real array starting at xx[0].
+ *   ff          - forces, accessed as a flat real array starting at ff[0] and
+ *                 handed to the gmx_fjsp_decrement_fma_* / gmx_fjsp_update_iforce_*
+ *                 helpers (presumably updated in place - confirm in
+ *                 kernelutil_sparc64_hpc_ace_double.h).
+ *   fr          - supplies shift_vec, fshift, epsfac, ic->k_rf, ic->c_rf, ntype,
+ *                 nbfp and rcoulomb.
+ *   mdatoms     - supplies chargeA and typeA.
+ *   kernel_data - supplies table_vdw (data and scale); energygrp_elec and
+ *                 energygrp_vdw are passed to gmx_fjsp_update_1pot_v2r8().
+ *   nrnb        - flop accounting, updated once at the end through inc_nrnb().
+ */
+void
+nb_kernel_ElecRFCut_VdwCSTab_GeomP1P1_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. the two
+     * different jnr indices (A and B) that are processed together in one inner-loop iteration.
+     *
+     * NOTE(review): several locals below are declared but never referenced in this
+     * kernel (e.g. tx,ty,tz, jidxall, one_sixth, one_twelfth, Heps, gbconv, ewconv);
+     * presumably they come from the generator's shared declaration template.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+    real             *vftab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+    krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0);
+    crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf);
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    vftab            = kernel_data->table_vdw->data;
+    vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_vdw->scale);
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+    rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+
+    /* Give the j indices/offsets defined values up front so the compiler does not
+     * warn about possibly uninitialized use */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists (one i particle per list entry) */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list (offset into the flat shiftvec/fshift arrays) */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors: j entries are jjnr[j_index_start..j_index_end-1] */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_1rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+
+        /* Load parameters for i particles: the i charge is pre-multiplied by facel
+         * (fr->epsfac); vdwioffset0 is the offset of this i type's row in vdwparam */
+        iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_load1_v2r8(charge+inr+0));
+        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+        vvdwsum          = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop: two j entries (A,B) per iteration; a possible
+         * single leftover entry is handled after the loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Skip all work below when neither j particle is inside the cutoff; when only
+             * one is, cutoff_mask further down removes the other one's contribution */
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer.
+             * vfeps is rt minus the index converted back to floating point. The two integer
+             * indices are read back through the vfconv union and scaled by 8, matching the
+             * loads below at offsets +0,+2 (dispersion) and +4,+6 (repulsion), so each
+             * table point appears to occupy 8 reals. */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 8;
+            vfconv.i[1]     *= 8;
+
+            /* REACTION-FIELD ELECTROSTATICS
+             * Assuming _fjsp_madd_v2r8(a,b,c)=a*b+c and _fjsp_msub_v2r8(a,b,c)=a*b-c
+             * (TODO confirm against the intrinsic reference):
+             *   velec = qq00*(rinv00 + krf*rsq00 - crf)
+             *   felec = qq00*(rinv00*rinvsq00 - krf2),  krf2 = 2*k_rf
+             */
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq00,rinv00),crf));
+            felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+
+            /* CUBIC SPLINE TABLE DISPERSION
+             * Data loaded at the A index (vfconv.i[0]) and the B index (vfconv.i[1]) is
+             * combined by GMX_FJSP_TRANSPOSE2_V2R8, presumably so that Y,F,G,H each hold
+             * one coefficient for both j particles (TODO confirm macro semantics).
+             * Assuming madd(a,b,c)=a*b+c:
+             *   Fp = F + vfeps*(G + vfeps*H)
+             *   VV = Y + vfeps*Fp
+             *   FF = Fp + vfeps*(G + 2*vfeps*H)
+             */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 2 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION
+             * Same evaluation as for dispersion, using the table data at offsets +4 and +6
+             * and scaling by c12 instead of c6. The final fvdw is
+             * -(fvdw6+fvdw12)*vftabscale*rinv00.
+             */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 4 );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 6 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            vvdw             = _fjsp_add_v2r8(vvdw12,vvdw6);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            /* Per-element mask for rsq00 < rcutoff2; it is ANDed with the potentials and
+             * the force below (presumably all-ones where true, zero elsewhere - TODO confirm) */
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+            vvdw             = _fjsp_and_v2r8(vvdw,cutoff_mask);
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force: accumulate displacement*fscal on the i particle;
+             * the j forces are handled by the helper call below */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fscal,dx00,dy00,dz00);
+
+            }
+
+            /* Inner loop uses 75 flops */
+        }
+
+        if(jidx<j_index_end) /* odd number of j entries: exactly one is left over */
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates (single j particle, 1ptr variant of the helper) */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+
+            /* Load parameters for j particles (only the charge of j particle A is loaded,
+             * on top of a zeroed register) */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer.
+             * Both vfconv.i[0] and vfconv.i[1] are scaled, but only vfconv.i[0] is used
+             * for table loads in this single-particle branch. */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 8;
+            vfconv.i[1]     *= 8;
+
+            /* REACTION-FIELD ELECTROSTATICS (same expressions as in the paired loop above) */
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq00,rinv00),crf));
+            felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+
+            /* CUBIC SPLINE TABLE DISPERSION
+             * Only the A index is loaded; the second input to the transpose macro is
+             * a zeroed register instead of data for a B particle. The polynomial
+             * evaluation is the same as in the paired loop above.
+             */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION (table data at offsets +4 and +6, A index only) */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            vvdw             = _fjsp_add_v2r8(vvdw12,vvdw6);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom.
+             * NOTE(review): the _fjsp_unpacklo_v2r8(v,zero) calls presumably keep the element
+             * belonging to j particle A and zero the other one, so the unused SIMD element
+             * contributes neither energy nor force - TODO confirm. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+            vvdw             = _fjsp_and_v2r8(vvdw,cutoff_mask);
+            vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fscal,dx00,dy00,dz00);
+
+            }
+
+            /* Inner loop uses 75 flops */
+        }
+
+        /* End of innermost loop: hand the accumulated i force to the helper together
+         * with the i particle's force slot and the shift-force slot for this list */
+
+        gmx_fjsp_update_iforce_1atom_swizzle_v2r8(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies: the per-i sums go to the energy-group slots
+         * selected by gid[iidx] in energygrp_elec and energygrp_vdw */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations (counted per j entry, not per SIMD pair) */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 9 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops: 9 per outer iteration plus 75 per j entry */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_VF,outeriter*9 + inneriter*75);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwCSTab_GeomP1P1_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: ReactionField
+ * VdW interaction:            CubicSplineTable
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecRFCut_VdwCSTab_GeomP1P1_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+    real             *vftab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+    krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0);
+    crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf);
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    vftab            = kernel_data->table_vdw->data;
+    vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_vdw->scale);
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+    rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_1rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+
+        /* Load parameters for i particles */
+        iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_load1_v2r8(charge+inr+0));
+        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 8;
+            vfconv.i[1]     *= 8;
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 2 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 4 );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 6 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fscal,dx00,dy00,dz00);
+
+            }
+
+            /* Inner loop uses 60 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 8;
+            vfconv.i[1]     *= 8;
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fscal,dx00,dy00,dz00);
+
+            }
+
+            /* Inner loop uses 60 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_1atom_swizzle_v2r8(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 7 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_F,outeriter*7 + inneriter*60);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRFCut_VdwCSTab_GeomW3P1_sparc64_hpc_ace_double.c b/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRFCut_VdwCSTab_GeomW3P1_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..2aa77da
--- /dev/null
@@ -0,0 +1,1115 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwCSTab_GeomW3P1_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: ReactionField
+ * VdW interaction:            CubicSplineTable
+ * Geometry:                   Water3-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecRFCut_VdwCSTab_GeomW3P1_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+    real             *vftab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+    krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0);
+    crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf);
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    vftab            = kernel_data->table_vdw->data;
+    vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_vdw->scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+    rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+        vvdwsum          = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 8;
+            vfconv.i[1]     *= 8;
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq00,rinv00),crf));
+            felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 2 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 4 );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 6 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            vvdw             = _fjsp_add_v2r8(vvdw12,vvdw6);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+            vvdw             = _fjsp_and_v2r8(vvdw,cutoff_mask);
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq10,rinv10),crf));
+            felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq20,rinv20),crf));
+            felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+
+            gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 156 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 8;
+            vfconv.i[1]     *= 8;
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq00,rinv00),crf));
+            felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            vvdw             = _fjsp_add_v2r8(vvdw12,vvdw6);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+            vvdw             = _fjsp_and_v2r8(vvdw,cutoff_mask);
+            vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq10,rinv10),crf));
+            felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq20,rinv20),crf));
+            felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+
+            gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 156 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 20 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_VF,outeriter*20 + inneriter*156);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwCSTab_GeomW3P1_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: ReactionField
+ * VdW interaction:            CubicSplineTable
+ * Geometry:                   Water3-Particle
+ * Calculate force/pot:        Force
+ *
+ * Force-only variant: no potential energies are accumulated in this function.
+ * For every i entry of the neighborlist (three i atoms, suffixes 0,1,2) the
+ * j particles are processed two per iteration (suffixes A,B); an odd leftover
+ * j particle is handled by a separate single-j block after the loop.
+ * Atom 0 gets reaction-field electrostatics plus tabulated VdW, atoms 1 and 2
+ * get reaction-field electrostatics only. All three interactions are masked
+ * with the same squared cutoff, derived from fr->rcoulomb.
+ *
+ * Parameters:
+ *   nlist       - neighborlist; nri, iinr, jindex, jjnr, shift and gid are read.
+ *   xx          - coordinates, accessed as a flat real array through xx[0].
+ *   ff          - forces, updated in place through ff[0].
+ *   fr          - supplies shift_vec, fshift, epsfac, ic->k_rf, ic->c_rf,
+ *                 ntype, nbfp and rcoulomb.
+ *   mdatoms     - supplies chargeA and typeA.
+ *   kernel_data - supplies table_vdw (data and scale).
+ *   nrnb        - flop accounting, updated once at the end through inc_nrnb().
+ *
+ * NOTE(review): the comments below assume _fjsp_madd_v2r8(a,b,c) is a*b+c and
+ * _fjsp_msub_v2r8(a,b,c) is a*b-c; confirm against the intrinsics reference.
+ */
+void
+nb_kernel_ElecRFCut_VdwCSTab_GeomW3P1_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     *
+     * Not every local declared below is referenced in this force-only kernel
+     * (for example velecsum, vvdwsum and ggid are never used here).
+     * NOTE(review): presumably the generator emits one shared declaration set - confirm.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+    real             *vftab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+    /* vfconv lets the two table indices stored from a SIMD register be read back as integers */
+    x                = xx[0];
+    f                = ff[0];
+    /* Cache neighborlist arrays and force-field parameters in locals */
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+    krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0);
+    crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf);
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    /* VdW table data and the scale that turns a distance into a table position (rt = r*scale below) */
+    vftab            = kernel_data->table_vdw->data;
+    vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_vdw->scale);
+
+    /* Setup water-specific parameters: the three i charges (pre-multiplied by facel)
+     * and the VdW parameter row offset of atom 0, all taken from the first i entry.
+     * NOTE(review): iinr[0] is read before the outer loop, so this presumably relies on
+     * the caller passing nri >= 1 and on every i entry sharing these parameters - confirm.
+     */
+    inr              = nlist->iinr[0];
+    iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+    rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+
+    /* Initialise the j indices/offsets so compilers do not warn about possibly uninitialised use */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+        /* Clear the i force accumulators for this list entry */
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop: two j particles (A,B) per iteration; the bound
+         * j_index_end-1 leaves an odd last j particle to the block after the loop.
+         */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+            /* fj collects the j-side contribution of all three i atoms for this j pair */
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+            /* i atom 0 - j: RF electrostatics + tabulated VdW. Skipped when, going by its name, the test finds no element inside the cutoff. */
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+            /* 8 reals per table point: offsets 0..3 are read for dispersion, 4..7 for repulsion */
+            vfconv.i[0]     *= 8;
+            vfconv.i[1]     *= 8;
+
+            /* REACTION-FIELD ELECTROSTATICS: force term only, qq*(rinv*rinvsq - 2*krf) */
+            felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+
+            /* CUBIC SPLINE TABLE DISPERSION
+             * Two reals are loaded per j particle and then transposed so that each register
+             * holds one coefficient for both j particles (presumably; see GMX_FJSP_TRANSPOSE2_V2R8).
+             * Fp = F + eps*(G + H*eps), FF = Fp + eps*(G + 2*eps*H).
+             */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 2 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION: same evaluation on table offsets 4..7 */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 4 );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 6 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+            /* Per-element mask from rsq00 < rcutoff2; it is ANDed into fscal below */
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            /* The same dx*fscal term is accumulated for the j particles */
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+            /* i atom 1 - j: reaction-field electrostatics only */
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            /* j-side accumulation */
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+            /* i atom 2 - j: reaction-field electrostatics only */
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            /* j-side accumulation */
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+            /* Hand the accumulated j contribution to the decrement helper for both j particles' entries in f */
+            gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 129 flops */
+        }
+        /* Remainder: one j particle is left when the j count of this entry is odd */
+        if(jidx<j_index_end)
+        {
+            /* Only jnrA exists here; data for a second j particle is never loaded */
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+
+            /* Load parameters for j particles (single charge loaded into a zeroed register) */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+            /* i atom 0 - j: RF electrostatics + tabulated VdW, single-j version */
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+            /* Only vfconv.i[0] is used for table loads in this block */
+            vfconv.i[0]     *= 8;
+            vfconv.i[1]     *= 8;
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+
+            /* CUBIC SPLINE TABLE DISPERSION: zero registers stand in for the missing second j load */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+            /* Combine with a zero register so the element without a j particle adds no force (presumably unpacklo keeps the low elements - confirm) */
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            /* j-side accumulation */
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+            /* i atom 1 - j: reaction-field electrostatics only, single-j version */
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            /* j-side accumulation */
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+            /* i atom 2 - j: reaction-field electrostatics only, single-j version */
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            /* j-side accumulation */
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+            /* Hand the accumulated j contribution to the single-pointer decrement helper */
+            gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 129 flops */
+        }
+
+        /* End of innermost loop */
+        /* Pass the nine i force accumulators to the helper together with the f and fshift locations of this entry */
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 18 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+    /* Uses the per-iteration counts quoted above: 18 per outer and 129 per inner iteration */
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_F,outeriter*18 + inneriter*129);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRFCut_VdwCSTab_GeomW3W3_sparc64_hpc_ace_double.c b/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRFCut_VdwCSTab_GeomW3W3_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..58ab23f
--- /dev/null
@@ -0,0 +1,2013 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwCSTab_GeomW3W3_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: ReactionField
+ * VdW interaction:            CubicSplineTable
+ * Geometry:                   Water3-Water3
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecRFCut_VdwCSTab_GeomW3W3_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B;
+    _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B;
+    _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+    _fjsp_v2r8       dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+    real             *vftab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+    krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0);
+    crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf);
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    vftab            = kernel_data->table_vdw->data;
+    vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_vdw->scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq0              = gmx_fjsp_set1_v2r8(charge[inr+0]);
+    jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+    jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    qq00             = _fjsp_mul_v2r8(iq0,jq0);
+    c6_00            = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    qq01             = _fjsp_mul_v2r8(iq0,jq1);
+    qq02             = _fjsp_mul_v2r8(iq0,jq2);
+    qq10             = _fjsp_mul_v2r8(iq1,jq0);
+    qq11             = _fjsp_mul_v2r8(iq1,jq1);
+    qq12             = _fjsp_mul_v2r8(iq1,jq2);
+    qq20             = _fjsp_mul_v2r8(iq2,jq0);
+    qq21             = _fjsp_mul_v2r8(iq2,jq1);
+    qq22             = _fjsp_mul_v2r8(iq2,jq2);
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+    rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+        vvdwsum          = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx01             = _fjsp_sub_v2r8(ix0,jx1);
+            dy01             = _fjsp_sub_v2r8(iy0,jy1);
+            dz01             = _fjsp_sub_v2r8(iz0,jz1);
+            dx02             = _fjsp_sub_v2r8(ix0,jx2);
+            dy02             = _fjsp_sub_v2r8(iy0,jy2);
+            dz02             = _fjsp_sub_v2r8(iz0,jz2);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+            rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+            rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+            rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 8;
+            vfconv.i[1]     *= 8;
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq00,rinv00),crf));
+            felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 2 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 4 );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 6 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            vvdw             = _fjsp_add_v2r8(vvdw12,vvdw6);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+            vvdw             = _fjsp_and_v2r8(vvdw,cutoff_mask);
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq01,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq01,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq01,rinv01),crf));
+            felec            = _fjsp_mul_v2r8(qq01,_fjsp_msub_v2r8(rinv01,rinvsq01,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq01,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+            
+            fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq02,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq02,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq02,rinv02),crf));
+            felec            = _fjsp_mul_v2r8(qq02,_fjsp_msub_v2r8(rinv02,rinvsq02,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq02,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+            
+            fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq10,rinv10),crf));
+            felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq11,rinv11),crf));
+            felec            = _fjsp_mul_v2r8(qq11,_fjsp_msub_v2r8(rinv11,rinvsq11,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq12,rinv12),crf));
+            felec            = _fjsp_mul_v2r8(qq12,_fjsp_msub_v2r8(rinv12,rinvsq12,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq20,rinv20),crf));
+            felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq21,rinv21),crf));
+            felec            = _fjsp_mul_v2r8(qq21,_fjsp_msub_v2r8(rinv21,rinvsq21,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq22,rinv22),crf));
+            felec            = _fjsp_mul_v2r8(qq22,_fjsp_msub_v2r8(rinv22,rinvsq22,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            }
+
+            gmx_fjsp_decrement_3rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 387 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx01             = _fjsp_sub_v2r8(ix0,jx1);
+            dy01             = _fjsp_sub_v2r8(iy0,jy1);
+            dz01             = _fjsp_sub_v2r8(iz0,jz1);
+            dx02             = _fjsp_sub_v2r8(ix0,jx2);
+            dy02             = _fjsp_sub_v2r8(iy0,jy2);
+            dz02             = _fjsp_sub_v2r8(iz0,jz2);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+            rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+            rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+            rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 8;
+            vfconv.i[1]     *= 8;
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq00,rinv00),crf));
+            felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            vvdw             = _fjsp_add_v2r8(vvdw12,vvdw6);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+            vvdw             = _fjsp_and_v2r8(vvdw,cutoff_mask);
+            vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq01,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq01,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq01,rinv01),crf));
+            felec            = _fjsp_mul_v2r8(qq01,_fjsp_msub_v2r8(rinv01,rinvsq01,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq01,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+            
+            fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq02,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq02,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq02,rinv02),crf));
+            felec            = _fjsp_mul_v2r8(qq02,_fjsp_msub_v2r8(rinv02,rinvsq02,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq02,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+            
+            fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq10,rinv10),crf));
+            felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq11,rinv11),crf));
+            felec            = _fjsp_mul_v2r8(qq11,_fjsp_msub_v2r8(rinv11,rinvsq11,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq12,rinv12),crf));
+            felec            = _fjsp_mul_v2r8(qq12,_fjsp_msub_v2r8(rinv12,rinvsq12,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq20,rinv20),crf));
+            felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq21,rinv21),crf));
+            felec            = _fjsp_mul_v2r8(qq21,_fjsp_msub_v2r8(rinv21,rinvsq21,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq22,rinv22),crf));
+            felec            = _fjsp_mul_v2r8(qq22,_fjsp_msub_v2r8(rinv22,rinvsq22,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            }
+
+            gmx_fjsp_decrement_3rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 387 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 20 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*387);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwCSTab_GeomW3W3_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: ReactionField
+ * VdW interaction:            CubicSplineTable
+ * Geometry:                   Water3-Water3
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecRFCut_VdwCSTab_GeomW3W3_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. the two different
+     * jnr indices (jnrA,jnrB) corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B;
+    _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B;
+    _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+    _fjsp_v2r8       dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+    real             *vftab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+    krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0);
+    crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf);
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    vftab            = kernel_data->table_vdw->data;
+    vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_vdw->scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq0              = gmx_fjsp_set1_v2r8(charge[inr+0]);
+    jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+    jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    qq00             = _fjsp_mul_v2r8(iq0,jq0);
+    c6_00            = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    qq01             = _fjsp_mul_v2r8(iq0,jq1);
+    qq02             = _fjsp_mul_v2r8(iq0,jq2);
+    qq10             = _fjsp_mul_v2r8(iq1,jq0);
+    qq11             = _fjsp_mul_v2r8(iq1,jq1);
+    qq12             = _fjsp_mul_v2r8(iq1,jq2);
+    qq20             = _fjsp_mul_v2r8(iq2,jq0);
+    qq21             = _fjsp_mul_v2r8(iq2,jq1);
+    qq22             = _fjsp_mul_v2r8(iq2,jq2);
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+    rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx01             = _fjsp_sub_v2r8(ix0,jx1);
+            dy01             = _fjsp_sub_v2r8(iy0,jy1);
+            dz01             = _fjsp_sub_v2r8(iz0,jz1);
+            dx02             = _fjsp_sub_v2r8(ix0,jx2);
+            dy02             = _fjsp_sub_v2r8(iy0,jy2);
+            dz02             = _fjsp_sub_v2r8(iz0,jz2);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+            rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+            rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+            rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 8;
+            vfconv.i[1]     *= 8;
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 2 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 4 );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 6 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq01,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq01,_fjsp_msub_v2r8(rinv01,rinvsq01,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq01,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+            
+            fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq02,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq02,_fjsp_msub_v2r8(rinv02,rinvsq02,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq02,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+            
+            fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq11,_fjsp_msub_v2r8(rinv11,rinvsq11,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq12,_fjsp_msub_v2r8(rinv12,rinvsq12,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq21,_fjsp_msub_v2r8(rinv21,rinvsq21,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq22,_fjsp_msub_v2r8(rinv22,rinvsq22,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            }
+
+            gmx_fjsp_decrement_3rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 324 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx01             = _fjsp_sub_v2r8(ix0,jx1);
+            dy01             = _fjsp_sub_v2r8(iy0,jy1);
+            dz01             = _fjsp_sub_v2r8(iz0,jz1);
+            dx02             = _fjsp_sub_v2r8(ix0,jx2);
+            dy02             = _fjsp_sub_v2r8(iy0,jy2);
+            dz02             = _fjsp_sub_v2r8(iz0,jz2);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+            rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+            rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+            rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 8;
+            vfconv.i[1]     *= 8;
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq01,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq01,_fjsp_msub_v2r8(rinv01,rinvsq01,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq01,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+            
+            fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq02,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq02,_fjsp_msub_v2r8(rinv02,rinvsq02,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq02,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+            
+            fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq11,_fjsp_msub_v2r8(rinv11,rinvsq11,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq12,_fjsp_msub_v2r8(rinv12,rinvsq12,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq21,_fjsp_msub_v2r8(rinv21,rinvsq21,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq22,_fjsp_msub_v2r8(rinv22,rinvsq22,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            }
+
+            gmx_fjsp_decrement_3rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 324 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 18 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*324);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRFCut_VdwCSTab_GeomW4P1_sparc64_hpc_ace_double.c b/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRFCut_VdwCSTab_GeomW4P1_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..75eba66
--- /dev/null
@@ -0,0 +1,1221 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwCSTab_GeomW4P1_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: ReactionField
+ * VdW interaction:            CubicSplineTable
+ * Geometry:                   Water4-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecRFCut_VdwCSTab_GeomW4P1_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+    real             *vftab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+    krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0);
+    crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf);
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    vftab            = kernel_data->table_vdw->data;
+    vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_vdw->scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+    rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_4rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+        fix3             = _fjsp_setzero_v2r8();
+        fiy3             = _fjsp_setzero_v2r8();
+        fiz3             = _fjsp_setzero_v2r8();
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+        vvdwsum          = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx30             = _fjsp_sub_v2r8(ix3,jx0);
+            dy30             = _fjsp_sub_v2r8(iy3,jy0);
+            dz30             = _fjsp_sub_v2r8(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 8;
+            vfconv.i[1]     *= 8;
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 2 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 4 );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 6 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            vvdw             = _fjsp_add_v2r8(vvdw12,vvdw6);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq10,rinv10),crf));
+            felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq20,rinv20),crf));
+            felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq30,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _fjsp_mul_v2r8(iq3,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq30,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq30,rinv30),crf));
+            felec            = _fjsp_mul_v2r8(qq30,_fjsp_msub_v2r8(rinv30,rinvsq30,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq30,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+            
+            fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+
+            }
+
+            gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 179 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx30             = _fjsp_sub_v2r8(ix3,jx0);
+            dy30             = _fjsp_sub_v2r8(iy3,jy0);
+            dz30             = _fjsp_sub_v2r8(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 8;
+            vfconv.i[1]     *= 8;
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            vvdw             = _fjsp_add_v2r8(vvdw12,vvdw6);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq10,rinv10),crf));
+            felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq20,rinv20),crf));
+            felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq30,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _fjsp_mul_v2r8(iq3,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq30,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq30,rinv30),crf));
+            felec            = _fjsp_mul_v2r8(qq30,_fjsp_msub_v2r8(rinv30,rinvsq30,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq30,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+            
+            fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+
+            }
+
+            gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 179 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_4atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 26 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_VF,outeriter*26 + inneriter*179);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwCSTab_GeomW4P1_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: ReactionField
+ * VdW interaction:            CubicSplineTable
+ * Geometry:                   Water4-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecRFCut_VdwCSTab_GeomW4P1_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+    real             *vftab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+    krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0);
+    crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf);
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    vftab            = kernel_data->table_vdw->data;
+    vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_vdw->scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+    rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_4rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+        fix3             = _fjsp_setzero_v2r8();
+        fiy3             = _fjsp_setzero_v2r8();
+        fiz3             = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx30             = _fjsp_sub_v2r8(ix3,jx0);
+            dy30             = _fjsp_sub_v2r8(iy3,jy0);
+            dz30             = _fjsp_sub_v2r8(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 8;
+            vfconv.i[1]     *= 8;
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 2 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 4 );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 6 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            fscal            = fvdw;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq30,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _fjsp_mul_v2r8(iq3,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq30,_fjsp_msub_v2r8(rinv30,rinvsq30,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq30,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+            
+            fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+
+            }
+
+            gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 153 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx30             = _fjsp_sub_v2r8(ix3,jx0);
+            dy30             = _fjsp_sub_v2r8(iy3,jy0);
+            dz30             = _fjsp_sub_v2r8(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 8;
+            vfconv.i[1]     *= 8;
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            fscal            = fvdw;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq30,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _fjsp_mul_v2r8(iq3,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq30,_fjsp_msub_v2r8(rinv30,rinvsq30,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq30,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+            
+            fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+
+            }
+
+            gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 153 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_4atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 24 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_F,outeriter*24 + inneriter*153);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_sparc64_hpc_ace_double.c b/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..99a176c
--- /dev/null
@@ -0,0 +1,2131 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: ReactionField
+ * VdW interaction:            CubicSplineTable
+ * Geometry:                   Water4-Water4
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B;
+    _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B;
+    _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    int              vdwjidx3A,vdwjidx3B;
+    _fjsp_v2r8       jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    _fjsp_v2r8       dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+    _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    _fjsp_v2r8       dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+    _fjsp_v2r8       dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+    _fjsp_v2r8       dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+    _fjsp_v2r8       dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+    real             *vftab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+    krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0);
+    crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf);
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    vftab            = kernel_data->table_vdw->data;
+    vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_vdw->scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+    jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+    jq3              = gmx_fjsp_set1_v2r8(charge[inr+3]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    c6_00            = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    qq11             = _fjsp_mul_v2r8(iq1,jq1);
+    qq12             = _fjsp_mul_v2r8(iq1,jq2);
+    qq13             = _fjsp_mul_v2r8(iq1,jq3);
+    qq21             = _fjsp_mul_v2r8(iq2,jq1);
+    qq22             = _fjsp_mul_v2r8(iq2,jq2);
+    qq23             = _fjsp_mul_v2r8(iq2,jq3);
+    qq31             = _fjsp_mul_v2r8(iq3,jq1);
+    qq32             = _fjsp_mul_v2r8(iq3,jq2);
+    qq33             = _fjsp_mul_v2r8(iq3,jq3);
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+    rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_4rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+        fix3             = _fjsp_setzero_v2r8();
+        fiy3             = _fjsp_setzero_v2r8();
+        fiz3             = _fjsp_setzero_v2r8();
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+        vvdwsum          = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_4rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx13             = _fjsp_sub_v2r8(ix1,jx3);
+            dy13             = _fjsp_sub_v2r8(iy1,jy3);
+            dz13             = _fjsp_sub_v2r8(iz1,jz3);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+            dx23             = _fjsp_sub_v2r8(ix2,jx3);
+            dy23             = _fjsp_sub_v2r8(iy2,jy3);
+            dz23             = _fjsp_sub_v2r8(iz2,jz3);
+            dx31             = _fjsp_sub_v2r8(ix3,jx1);
+            dy31             = _fjsp_sub_v2r8(iy3,jy1);
+            dz31             = _fjsp_sub_v2r8(iz3,jz1);
+            dx32             = _fjsp_sub_v2r8(ix3,jx2);
+            dy32             = _fjsp_sub_v2r8(iy3,jy2);
+            dz32             = _fjsp_sub_v2r8(iz3,jz2);
+            dx33             = _fjsp_sub_v2r8(ix3,jx3);
+            dy33             = _fjsp_sub_v2r8(iy3,jy3);
+            dz33             = _fjsp_sub_v2r8(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+            rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+            rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+            rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+            rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+            rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+            rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+            rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+            rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+            rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+            rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+            rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+            rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+            fjx3             = _fjsp_setzero_v2r8();
+            fjy3             = _fjsp_setzero_v2r8();
+            fjz3             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 8;
+            vfconv.i[1]     *= 8;
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 2 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 4 );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 6 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            vvdw             = _fjsp_add_v2r8(vvdw12,vvdw6);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq11,rinv11),crf));
+            felec            = _fjsp_mul_v2r8(qq11,_fjsp_msub_v2r8(rinv11,rinvsq11,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq12,rinv12),crf));
+            felec            = _fjsp_mul_v2r8(qq12,_fjsp_msub_v2r8(rinv12,rinvsq12,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq13,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq13,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq13,rinv13),crf));
+            felec            = _fjsp_mul_v2r8(qq13,_fjsp_msub_v2r8(rinv13,rinvsq13,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq13,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+            
+            fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq21,rinv21),crf));
+            felec            = _fjsp_mul_v2r8(qq21,_fjsp_msub_v2r8(rinv21,rinvsq21,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq22,rinv22),crf));
+            felec            = _fjsp_mul_v2r8(qq22,_fjsp_msub_v2r8(rinv22,rinvsq22,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq23,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq23,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq23,rinv23),crf));
+            felec            = _fjsp_mul_v2r8(qq23,_fjsp_msub_v2r8(rinv23,rinvsq23,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq23,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+            
+            fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq31,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq31,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq31,rinv31),crf));
+            felec            = _fjsp_mul_v2r8(qq31,_fjsp_msub_v2r8(rinv31,rinvsq31,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq31,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+            
+            fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq32,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq32,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq32,rinv32),crf));
+            felec            = _fjsp_mul_v2r8(qq32,_fjsp_msub_v2r8(rinv32,rinvsq32,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq32,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+            
+            fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq33,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq33,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq33,rinv33),crf));
+            felec            = _fjsp_mul_v2r8(qq33,_fjsp_msub_v2r8(rinv33,rinvsq33,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq33,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+            
+            fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+
+            }
+
+            gmx_fjsp_decrement_4rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 413 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_4rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx13             = _fjsp_sub_v2r8(ix1,jx3);
+            dy13             = _fjsp_sub_v2r8(iy1,jy3);
+            dz13             = _fjsp_sub_v2r8(iz1,jz3);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+            dx23             = _fjsp_sub_v2r8(ix2,jx3);
+            dy23             = _fjsp_sub_v2r8(iy2,jy3);
+            dz23             = _fjsp_sub_v2r8(iz2,jz3);
+            dx31             = _fjsp_sub_v2r8(ix3,jx1);
+            dy31             = _fjsp_sub_v2r8(iy3,jy1);
+            dz31             = _fjsp_sub_v2r8(iz3,jz1);
+            dx32             = _fjsp_sub_v2r8(ix3,jx2);
+            dy32             = _fjsp_sub_v2r8(iy3,jy2);
+            dz32             = _fjsp_sub_v2r8(iz3,jz2);
+            dx33             = _fjsp_sub_v2r8(ix3,jx3);
+            dy33             = _fjsp_sub_v2r8(iy3,jy3);
+            dz33             = _fjsp_sub_v2r8(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+            rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+            rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+            rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+            rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+            rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+            rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+            rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+            rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+            rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+            rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+            rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+            rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+            fjx3             = _fjsp_setzero_v2r8();
+            fjy3             = _fjsp_setzero_v2r8();
+            fjz3             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 8;
+            vfconv.i[1]     *= 8;
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            vvdw             = _fjsp_add_v2r8(vvdw12,vvdw6);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq11,rinv11),crf));
+            felec            = _fjsp_mul_v2r8(qq11,_fjsp_msub_v2r8(rinv11,rinvsq11,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq12,rinv12),crf));
+            felec            = _fjsp_mul_v2r8(qq12,_fjsp_msub_v2r8(rinv12,rinvsq12,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq13,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq13,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq13,rinv13),crf));
+            felec            = _fjsp_mul_v2r8(qq13,_fjsp_msub_v2r8(rinv13,rinvsq13,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq13,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+            
+            fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq21,rinv21),crf));
+            felec            = _fjsp_mul_v2r8(qq21,_fjsp_msub_v2r8(rinv21,rinvsq21,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq22,rinv22),crf));
+            felec            = _fjsp_mul_v2r8(qq22,_fjsp_msub_v2r8(rinv22,rinvsq22,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq23,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq23,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq23,rinv23),crf));
+            felec            = _fjsp_mul_v2r8(qq23,_fjsp_msub_v2r8(rinv23,rinvsq23,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq23,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+            
+            fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq31,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq31,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq31,rinv31),crf));
+            felec            = _fjsp_mul_v2r8(qq31,_fjsp_msub_v2r8(rinv31,rinvsq31,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq31,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+            
+            fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq32,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq32,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq32,rinv32),crf));
+            felec            = _fjsp_mul_v2r8(qq32,_fjsp_msub_v2r8(rinv32,rinvsq32,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq32,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+            
+            fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq33,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq33,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq33,rinv33),crf));
+            felec            = _fjsp_mul_v2r8(qq33,_fjsp_msub_v2r8(rinv33,rinvsq33,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq33,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+            
+            fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+
+            }
+
+            gmx_fjsp_decrement_4rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 413 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_4atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 26 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*413);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: ReactionField
+ * VdW interaction:            CubicSplineTable
+ * Geometry:                   Water4-Water4
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B;
+    _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B;
+    _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    int              vdwjidx3A,vdwjidx3B;
+    _fjsp_v2r8       jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    _fjsp_v2r8       dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+    _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    _fjsp_v2r8       dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+    _fjsp_v2r8       dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+    _fjsp_v2r8       dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+    _fjsp_v2r8       dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+    real             *vftab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+    krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0);
+    crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf);
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    vftab            = kernel_data->table_vdw->data;
+    vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_vdw->scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+    jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+    jq3              = gmx_fjsp_set1_v2r8(charge[inr+3]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    c6_00            = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    qq11             = _fjsp_mul_v2r8(iq1,jq1);
+    qq12             = _fjsp_mul_v2r8(iq1,jq2);
+    qq13             = _fjsp_mul_v2r8(iq1,jq3);
+    qq21             = _fjsp_mul_v2r8(iq2,jq1);
+    qq22             = _fjsp_mul_v2r8(iq2,jq2);
+    qq23             = _fjsp_mul_v2r8(iq2,jq3);
+    qq31             = _fjsp_mul_v2r8(iq3,jq1);
+    qq32             = _fjsp_mul_v2r8(iq3,jq2);
+    qq33             = _fjsp_mul_v2r8(iq3,jq3);
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+    rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_4rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+        fix3             = _fjsp_setzero_v2r8();
+        fiy3             = _fjsp_setzero_v2r8();
+        fiz3             = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_4rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx13             = _fjsp_sub_v2r8(ix1,jx3);
+            dy13             = _fjsp_sub_v2r8(iy1,jy3);
+            dz13             = _fjsp_sub_v2r8(iz1,jz3);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+            dx23             = _fjsp_sub_v2r8(ix2,jx3);
+            dy23             = _fjsp_sub_v2r8(iy2,jy3);
+            dz23             = _fjsp_sub_v2r8(iz2,jz3);
+            dx31             = _fjsp_sub_v2r8(ix3,jx1);
+            dy31             = _fjsp_sub_v2r8(iy3,jy1);
+            dz31             = _fjsp_sub_v2r8(iz3,jz1);
+            dx32             = _fjsp_sub_v2r8(ix3,jx2);
+            dy32             = _fjsp_sub_v2r8(iy3,jy2);
+            dz32             = _fjsp_sub_v2r8(iz3,jz2);
+            dx33             = _fjsp_sub_v2r8(ix3,jx3);
+            dy33             = _fjsp_sub_v2r8(iy3,jy3);
+            dz33             = _fjsp_sub_v2r8(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+            rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+            rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+            rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+            rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+            rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+            rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+            rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+            rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+            rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+            rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+            rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+            rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+            fjx3             = _fjsp_setzero_v2r8();
+            fjy3             = _fjsp_setzero_v2r8();
+            fjz3             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 8;
+            vfconv.i[1]     *= 8;
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 2 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 4 );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 6 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            fscal            = fvdw;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq11,_fjsp_msub_v2r8(rinv11,rinvsq11,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq12,_fjsp_msub_v2r8(rinv12,rinvsq12,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq13,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq13,_fjsp_msub_v2r8(rinv13,rinvsq13,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq13,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+            
+            fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq21,_fjsp_msub_v2r8(rinv21,rinvsq21,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq22,_fjsp_msub_v2r8(rinv22,rinvsq22,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq23,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq23,_fjsp_msub_v2r8(rinv23,rinvsq23,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq23,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+            
+            fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq31,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq31,_fjsp_msub_v2r8(rinv31,rinvsq31,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq31,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+            
+            fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq32,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq32,_fjsp_msub_v2r8(rinv32,rinvsq32,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq32,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+            
+            fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq33,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq33,_fjsp_msub_v2r8(rinv33,rinvsq33,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq33,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+            
+            fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+
+            }
+
+            gmx_fjsp_decrement_4rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 351 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_4rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx13             = _fjsp_sub_v2r8(ix1,jx3);
+            dy13             = _fjsp_sub_v2r8(iy1,jy3);
+            dz13             = _fjsp_sub_v2r8(iz1,jz3);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+            dx23             = _fjsp_sub_v2r8(ix2,jx3);
+            dy23             = _fjsp_sub_v2r8(iy2,jy3);
+            dz23             = _fjsp_sub_v2r8(iz2,jz3);
+            dx31             = _fjsp_sub_v2r8(ix3,jx1);
+            dy31             = _fjsp_sub_v2r8(iy3,jy1);
+            dz31             = _fjsp_sub_v2r8(iz3,jz1);
+            dx32             = _fjsp_sub_v2r8(ix3,jx2);
+            dy32             = _fjsp_sub_v2r8(iy3,jy2);
+            dz32             = _fjsp_sub_v2r8(iz3,jz2);
+            dx33             = _fjsp_sub_v2r8(ix3,jx3);
+            dy33             = _fjsp_sub_v2r8(iy3,jy3);
+            dz33             = _fjsp_sub_v2r8(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+            rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+            rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+            rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+            rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+            rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+            rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+            rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+            rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+            rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+            rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+            rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+            rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+            fjx3             = _fjsp_setzero_v2r8();
+            fjy3             = _fjsp_setzero_v2r8();
+            fjz3             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 8;
+            vfconv.i[1]     *= 8;
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            fscal            = fvdw;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq11,_fjsp_msub_v2r8(rinv11,rinvsq11,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq12,_fjsp_msub_v2r8(rinv12,rinvsq12,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq13,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq13,_fjsp_msub_v2r8(rinv13,rinvsq13,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq13,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+            
+            fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq21,_fjsp_msub_v2r8(rinv21,rinvsq21,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq22,_fjsp_msub_v2r8(rinv22,rinvsq22,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq23,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq23,_fjsp_msub_v2r8(rinv23,rinvsq23,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq23,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+            
+            fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq31,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq31,_fjsp_msub_v2r8(rinv31,rinvsq31,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq31,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+            
+            fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq32,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq32,_fjsp_msub_v2r8(rinv32,rinvsq32,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq32,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+            
+            fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq33,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq33,_fjsp_msub_v2r8(rinv33,rinvsq33,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq33,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+            
+            fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+
+            }
+
+            gmx_fjsp_decrement_4rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 351 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_4atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 24 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*351);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRFCut_VdwLJSh_GeomP1P1_sparc64_hpc_ace_double.c b/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRFCut_VdwLJSh_GeomP1P1_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..aba7840
--- /dev/null
@@ -0,0 +1,607 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwLJSh_GeomP1P1_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: ReactionField (uses fr->ic->k_rf and fr->ic->c_rf; results masked by the rsq00 vs. rcutoff2 comparison)
+ * VdW interaction:            LennardJones (potential shifted using fr->ic->sh_invrc6; same cutoff mask as electrostatics)
+ * Geometry:                   Particle-Particle (one i atom per neighborlist entry, j atoms processed two per SIMD register)
+ * Calculate force/pot:        PotentialAndForce
+ * Reads nlist, xx, mdatoms and fr; updates ff, fr->fshift, kernel_data->energygrp_elec/energygrp_vdw and the nrnb flop counter. */
+void
+nb_kernel_ElecRFCut_VdwLJSh_GeomP1P1_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters (this particle-particle kernel only uses suffix 0).
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. the two different
+     * jnr indices whose data are put in the two positions of a _fjsp_v2r8 SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+    /* NOTE: several locals above (e.g. tx, isai0, r00, itab_tmp, dummy_mask, one, two, vfconv) are never referenced in this kernel */
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+    krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0);
+    crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf);
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+    rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+
+    sh_vdw_invrcut6  = gmx_fjsp_set1_v2r8(fr->ic->sh_invrc6);
+    rvdw             = gmx_fjsp_set1_v2r8(fr->rvdw);
+
+    /* Give the j indices/offsets defined values up front to avoid compiler warnings; the j loops assign them again before use */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Offset of this list's shift vector; used below to index both shiftvec and fshift */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_1rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+
+        /* Load parameters for i particles: charge multiplied by facel, and the i-type offset into vdwparam */
+        iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_load1_v2r8(charge+inr+0));
+        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+        vvdwsum          = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop: two j neighbors (A and B) per iteration; an odd leftover is handled after the loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+            /* NOTE(review): any_lt helper is defined in the kernelutil header; presumably true if either element of rsq00 is below rcutoff2 */
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq00,rinv00),crf));
+            felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+
+            /* LENNARD-JONES DISPERSION/REPULSION (potential vvdw is shifted using sh_vdw_invrcut6; force fvdw is not) */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8(_fjsp_nmsub_v2r8(c12_00,_fjsp_mul_v2r8(sh_vdw_invrcut6,sh_vdw_invrcut6),vvdw12),one_twelfth,
+                                           _fjsp_mul_v2r8(_fjsp_nmsub_v2r8( c6_00,sh_vdw_invrcut6,vvdw6),one_sixth));
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+            /* Comparison mask that is ANDed into velec, vvdw and fscal below */
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+            vvdw             = _fjsp_and_v2r8(vvdw,cutoff_mask);
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            /* NOTE(review): helper from the kernelutil header; presumably subtracts fscal*(dx,dy,dz) from the forces of j atoms A and B */
+            gmx_fjsp_decrement_fma_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fscal,dx00,dy00,dz00);
+
+            }
+
+            /* Inner loop uses 57 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+            /* Odd neighbor count: one j atom (A) remains; results pass through _fjsp_unpacklo_v2r8(...,zero), presumably to blank the unused element */
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq00,rinv00),crf));
+            felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+
+            /* LENNARD-JONES DISPERSION/REPULSION (potential vvdw is shifted using sh_vdw_invrcut6; force fvdw is not) */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8(_fjsp_nmsub_v2r8(c12_00,_fjsp_mul_v2r8(sh_vdw_invrcut6,sh_vdw_invrcut6),vvdw12),one_twelfth,
+                                           _fjsp_mul_v2r8(_fjsp_nmsub_v2r8( c6_00,sh_vdw_invrcut6,vvdw6),one_sixth));
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+            /* Comparison mask that is ANDed into velec, vvdw and fscal below */
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+            vvdw             = _fjsp_and_v2r8(vvdw,cutoff_mask);
+            vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fscal,dx00,dy00,dz00);
+
+            }
+
+            /* Inner loop uses 57 flops */
+        }
+
+        /* End of innermost loop */
+        /* NOTE(review): helper from the kernelutil header; presumably reduces fix0/fiy0/fiz0 and adds them to the i-atom force and shift force */
+        gmx_fjsp_update_iforce_1atom_swizzle_v2r8(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies for the energy group entry selected by ggid */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 9 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops using the per-iteration counts noted above (9 outer, 57 inner) */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_VF,outeriter*9 + inneriter*57);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwLJSh_GeomP1P1_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: ReactionField
+ * VdW interaction:            LennardJones
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        Force
+ *
+ * Force-only variant: for each i particle entry of the neighbor list the kernel
+ * builds a combined reaction-field + Lennard-Jones scalar force (fscal) against
+ * its j neighbors, accumulates dx*fscal into fix0/fiy0/fiz0 and hands fscal and
+ * the displacement to the gmx_fjsp_decrement_fma_* helpers together with the j
+ * force pointers. No potential energies are accumulated here.
+ * Only element pairs with rsq00 < rcutoff2 (fr->rcoulomb squared) keep a
+ * non-masked fscal.
+ *
+ * Arguments:
+ *   nlist        neighbor list; nri, iinr, jindex, jjnr and shift are read
+ *                (gid is fetched but not used in this variant)
+ *   xx           particle coordinates, read through x = xx[0]
+ *   ff           forces, updated through f = ff[0]
+ *   fr           supplies shift_vec, fshift, epsfac, ic->k_rf, ic->c_rf,
+ *                ic->sh_invrc6, ntype, nbfp, rcoulomb and rvdw
+ *   mdatoms      supplies chargeA and typeA
+ *   kernel_data  not referenced in this kernel
+ *   nrnb         flop counter, updated once at the end through inc_nrnb()
+ *                under eNR_NBKERNEL_ELEC_VDW_F
+ *
+ * NOTE(review): the LJ shift constant (sh_vdw_invrcut6) is loaded but never read
+ * in this function; in the VF sibling kernel it only enters the potential.
+ */
+void
+nb_kernel_ElecRFCut_VdwLJSh_GeomP1P1_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     *
+     * Several locals declared below (for example ggid, tx, ty, tz, jidxall, velec, velecsum,
+     * vvdw, vvdwsum, itab_tmp, dummy_mask, vfconv, gbconv, ewconv) are never referenced in this
+     * force-only kernel. NOTE(review): presumably they come from the shared generator template.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+    krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0);
+    crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf);
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice.
+     * rcutoff2 (the cutoff squared) is the value the loops below compare rsq00 against.
+     */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+    rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+
+    sh_vdw_invrcut6  = gmx_fjsp_set1_v2r8(fr->ic->sh_invrc6); /* set here, not read again in this function */
+    rvdw             = gmx_fjsp_set1_v2r8(fr->rvdw); /* set here, not read again in this function */
+
+    /* Give the j indices and coordinate offsets defined values up front to avoid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists: one iteration per i particle entry, nri entries in total */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Offset of this list's shift vector; the same offset is used for shiftvec and fshift (DIM reals per entry) */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors: this list's j entries are jjnr[j_index_start] up to jjnr[j_index_end-1] */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_1rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+
+        /* Load parameters for i particles: the charge is pre-multiplied by facel (fr->epsfac), and
+         * vdwioffset0 is the start of this i type's row in vdwparam. The factor 2 matches the
+         * (c6,c12) pair fetched per type combination further down.
+         */
+        iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_load1_v2r8(charge+inr+0));
+        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+        /* Start inner kernel loop: two j neighbors (A and B) per iteration. The loop condition stops
+         * before a final unpaired neighbor, which is handled separately after the loop.
+         */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2)) /* judging by the helper name: any element inside the cutoff -- TODO confirm */
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            /* REACTION-FIELD ELECTROSTATICS
+             * felec = qq00*(rinv00*rinvsq00 - krf2), assuming _fjsp_msub_v2r8(a,b,c) evaluates a*b-c (TODO confirm).
+             */
+            felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+
+            /* LENNARD-JONES DISPERSION/REPULSION
+             * fvdw = (c12_00*rinvsix - c6_00)*rinvsix*rinvsq00 under the same msub assumption.
+             */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,c6_00),_fjsp_mul_v2r8(rinvsix,rinvsq00));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force. fscal was ANDed with cutoff_mask above, so elements beyond the
+             * cutoff presumably contribute zero (depends on the mask format of _fjsp_cmplt_v2r8).
+             */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fscal,dx00,dy00,dz00);
+
+            }
+
+            /* Inner loop uses 40 flops */
+        }
+
+        if(jidx<j_index_end) /* odd number of j neighbors: one unpaired entry is left */
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates (single j atom, so the 1ptr variant of the loader is used) */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+
+            /* Load parameters for j particles (only the A entry exists here) */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            /* REACTION-FIELD ELECTROSTATICS (same expression as in the paired loop) */
+            felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+
+            /* LENNARD-JONES DISPERSION/REPULSION (same expression as in the paired loop) */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,c6_00),_fjsp_mul_v2r8(rinvsix,rinvsq00));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8()); /* presumably keeps the A element and zeroes the unused one -- TODO confirm */
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fscal,dx00,dy00,dz00);
+
+            }
+
+            /* Inner loop uses 40 flops */
+        }
+
+        /* End of innermost loop: pass the accumulated i force to the update helper together with
+         * the i particle's force pointer and this list's shift-force pointer.
+         */
+
+        gmx_fjsp_update_iforce_1atom_swizzle_v2r8(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations (counted per j entry, not per SIMD pair) */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 7 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops: 7 per outer and 40 per inner iteration, matching the per-loop comments above */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_F,outeriter*7 + inneriter*40);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRFCut_VdwLJSh_GeomW3P1_sparc64_hpc_ace_double.c b/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRFCut_VdwLJSh_GeomW3P1_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..8a88a91
--- /dev/null
@@ -0,0 +1,989 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwLJSh_GeomW3P1_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: ReactionField
+ * VdW interaction:            LennardJones
+ * Geometry:                   Water3-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecRFCut_VdwLJSh_GeomW3P1_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+    krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0);
+    crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf);
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+    rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+
+    sh_vdw_invrcut6  = gmx_fjsp_set1_v2r8(fr->ic->sh_invrc6);
+    rvdw             = gmx_fjsp_set1_v2r8(fr->rvdw);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+        vvdwsum          = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq00,rinv00),crf));
+            felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8(_fjsp_nmsub_v2r8(c12_00,_fjsp_mul_v2r8(sh_vdw_invrcut6,sh_vdw_invrcut6),vvdw12),one_twelfth,
+                                           _fjsp_mul_v2r8(_fjsp_nmsub_v2r8( c6_00,sh_vdw_invrcut6,vvdw6),one_sixth));
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+            vvdw             = _fjsp_and_v2r8(vvdw,cutoff_mask);
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq10,rinv10),crf));
+            felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq20,rinv20),crf));
+            felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+
+            gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 138 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq00,rinv00),crf));
+            felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8(_fjsp_nmsub_v2r8(c12_00,_fjsp_mul_v2r8(sh_vdw_invrcut6,sh_vdw_invrcut6),vvdw12),one_twelfth,
+                                           _fjsp_mul_v2r8(_fjsp_nmsub_v2r8( c6_00,sh_vdw_invrcut6,vvdw6),one_sixth));
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+            vvdw             = _fjsp_and_v2r8(vvdw,cutoff_mask);
+            vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq10,rinv10),crf));
+            felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq20,rinv20),crf));
+            felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+
+            gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 138 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 20 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_VF,outeriter*20 + inneriter*138);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwLJSh_GeomW3P1_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: ReactionField (force term only, masked at the fr->rcoulomb cut-off)
+ * VdW interaction:            LennardJones (force term only, evaluated for i atom 0 only)
+ * Geometry:                   Water3-Particle (three i atoms per list entry against single j atoms)
+ * Calculate force/pot:        Force (no potential sums are accumulated or stored in this kernel)
+ */
+void
+nb_kernel_ElecRFCut_VdwLJSh_GeomW3P1_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,       /* neighbor list; nri, iinr, jindex, jjnr, shift and gid are read */
+                     rvec * gmx_restrict                    xx,          /* coordinates; accessed as a flat real array through xx[0] */
+                     rvec * gmx_restrict                    ff,          /* forces; accessed as a flat real array through ff[0] */
+                     t_forcerec * gmx_restrict              fr,          /* supplies shift_vec, fshift, epsfac, ntype, nbfp, rcoulomb, rvdw, ic->k_rf, ic->c_rf, ic->sh_invrc6 */
+                     t_mdatoms * gmx_restrict               mdatoms,     /* supplies chargeA and typeA */
+                     nb_kernel_data_t * gmx_restrict        kernel_data, /* not referenced in this force-only kernel */
+                     t_nrnb * gmx_restrict                  nrnb)        /* flop counters, updated through inc_nrnb at the end */
+{
+    /* Suffixes 0,1,2 refer to the three i (water) atoms and suffix 0 to the single j atom; a two-digit
+     * suffix names an i-j pair, e.g. dx10 is built from i atom 1 and j atom 0.
+     * Suffixes A,B refer to the two j list entries handled together in one pass of the unrolled inner
+     * loop: jnrA is jjnr[jidx] and jnrB is jjnr[jidx+1].
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; /* ggid is not referenced in this kernel */
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; /* tx,ty,tz,jidxall are not referenced in this kernel */
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1; /* never assigned here: only i atom 0 takes part in VdW */
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2; /* never assigned here: only i atom 0 takes part in VdW */
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2; /* velec and velecsum are not referenced in this kernel */
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);  /* initialized but not referenced below */
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0); /* initialized but not referenced below */
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0); /* initialized but not referenced below */
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0); /* initialized but not referenced below */
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv; /* not referenced in this kernel */
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid; /* fetched but not used: no energy groups are updated here */
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf); /* set but not referenced below; only krf2 enters the force */
+    krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0);
+    crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf); /* set but not referenced below */
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    /* Setup water-specific parameters from the first list entry; they are computed once and reused for every i entry */
+    inr              = nlist->iinr[0]; /* NOTE(review): read before the nri loop, so also when nri is 0 - confirm iinr always has an element */
+    iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0]; /* offset into vdwparam for i atom 0; i atoms 1 and 2 are charge-only here */
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+    rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+
+    sh_vdw_invrcut6  = gmx_fjsp_set1_v2r8(fr->ic->sh_invrc6); /* set but not referenced in this force-only kernel */
+    rvdw             = gmx_fjsp_set1_v2r8(fr->rvdw);          /* set but not referenced in this force-only kernel */
+
+    /* Avoid compiler warnings; these are always assigned inside the loops before they are used */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists: one i entry (three consecutive i atoms starting at inr) per iteration */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector offset for this list entry */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors: j entries jindex[iidx] .. jindex[iidx+1]-1 */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector (done inside the helper, per its name) */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _fjsp_setzero_v2r8(); /* i-force accumulators are reset for every i entry */
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop: two j entries (A,B) per pass; a leftover odd entry is handled after the loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates for both entries */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vectors (i minus j) for the three i atoms */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+
+            /* Calculate squared distance and things based on it (rinv from the invsqrt helper, rinvsq = rinv*rinv) */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+
+            /* Load parameters for j particles: charges, and doubled type indices used as offsets into vdwparam */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            fjx0             = _fjsp_setzero_v2r8(); /* j-force accumulator, reset for every pass */
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2)) /* pair 00: entered only if the helper reports an entry below rcutoff2 */
+            {
+
+            /* Compute parameters for interactions between i and j atoms (charge product and c6/c12 for both j entries) */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            /* REACTION-FIELD ELECTROSTATICS: felec = qq*(rinv*rinvsq - krf2), assuming msub(a,b,c) computes a*b-c */
+            felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+
+            /* LENNARD-JONES DISPERSION/REPULSION: fvdw = (c12*rinvsix - c6)*rinvsix*rinvsq, same msub assumption */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,c6_00),_fjsp_mul_v2r8(rinvsix,rinvsq00));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask); /* presumably clears fscal for entries at or beyond the cut-off */
+
+            /* Update vectorial force: i and j accumulators both add d*fscal, assuming madd(a,b,c) computes a*b+c */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2)) /* pair 10: electrostatics only */
+            {
+
+            /* Compute parameters for interactions between i and j atoms (charge product only) */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS: felec = qq*(rinv*rinvsq - krf2), assuming msub(a,b,c) computes a*b-c */
+            felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask); /* presumably clears fscal for entries at or beyond the cut-off */
+
+            /* Update vectorial force for i atom 1 and the shared j accumulator */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2)) /* pair 20: electrostatics only */
+            {
+
+            /* Compute parameters for interactions between i and j atoms (charge product only) */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS: felec = qq*(rinv*rinvsq - krf2), assuming msub(a,b,c) computes a*b-c */
+            felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask); /* presumably clears fscal for entries at or beyond the cut-off */
+
+            /* Update vectorial force for i atom 2 and the shared j accumulator */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+
+            gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0); /* by its name, subtracts the accumulated sums from the two j forces */
+
+            /* Inner loop uses 109 flops */
+        }
+
+        if(jidx<j_index_end) /* one j entry left over (odd-length list): same computation for entry A only */
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates for the single remaining entry */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vectors (i minus j) for the three i atoms */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+
+            /* Calculate squared distance and things based on it (rinv from the invsqrt helper, rinvsq = rinv*rinv) */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+
+            /* Load parameters for j particles: one charge (loaded on top of a zeroed register) and one type index */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            fjx0             = _fjsp_setzero_v2r8(); /* j-force accumulator for the single remaining entry */
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2)) /* pair 00: electrostatics plus Lennard-Jones */
+            {
+
+            /* Compute parameters for interactions between i and j atoms (charge product and c6/c12 for entry A) */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            /* REACTION-FIELD ELECTROSTATICS: felec = qq*(rinv*rinvsq - krf2), assuming msub(a,b,c) computes a*b-c */
+            felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+
+            /* LENNARD-JONES DISPERSION/REPULSION: fvdw = (c12*rinvsix - c6)*rinvsix*rinvsq, same msub assumption */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,c6_00),_fjsp_mul_v2r8(rinvsix,rinvsq00));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8()); /* presumably keeps element 0 and zeroes element 1, so the unused slot adds no force - confirm */
+
+            /* Update vectorial force: i and j accumulators both add d*fscal, assuming madd(a,b,c) computes a*b+c */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2)) /* pair 10: electrostatics only */
+            {
+
+            /* Compute parameters for interactions between i and j atoms (charge product only) */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS: felec = qq*(rinv*rinvsq - krf2), assuming msub(a,b,c) computes a*b-c */
+            felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8()); /* presumably keeps element 0 and zeroes element 1 - confirm */
+
+            /* Update vectorial force for i atom 1 and the shared j accumulator */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2)) /* pair 20: electrostatics only */
+            {
+
+            /* Compute parameters for interactions between i and j atoms (charge product only) */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS: felec = qq*(rinv*rinvsq - krf2), assuming msub(a,b,c) computes a*b-c */
+            felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8()); /* presumably keeps element 0 and zeroes element 1 - confirm */
+
+            /* Update vectorial force for i atom 2 and the shared j accumulator */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+
+            gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0); /* by its name, subtracts the accumulated sum from the single j force */
+
+            /* Inner loop uses 109 flops */
+        }
+
+        /* End of innermost loop; pass the nine i-force accumulators to the helper with the f and fshift destinations */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations by the number of j entries of this i entry */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 18 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops: 18 per i entry plus 109 per j entry, booked under eNR_NBKERNEL_ELEC_VDW_W3_F */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_F,outeriter*18 + inneriter*109);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRFCut_VdwLJSh_GeomW3W3_sparc64_hpc_ace_double.c b/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRFCut_VdwLJSh_GeomW3W3_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..d3b9655
--- /dev/null
@@ -0,0 +1,1887 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwLJSh_GeomW3W3_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: ReactionField
+ * VdW interaction:            LennardJones
+ * Geometry:                   Water3-Water3
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecRFCut_VdwLJSh_GeomW3W3_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B;
+    _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B;
+    _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+    _fjsp_v2r8       dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+    krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0);
+    crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf);
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq0              = gmx_fjsp_set1_v2r8(charge[inr+0]);
+    jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+    jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    qq00             = _fjsp_mul_v2r8(iq0,jq0);
+    c6_00            = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    qq01             = _fjsp_mul_v2r8(iq0,jq1);
+    qq02             = _fjsp_mul_v2r8(iq0,jq2);
+    qq10             = _fjsp_mul_v2r8(iq1,jq0);
+    qq11             = _fjsp_mul_v2r8(iq1,jq1);
+    qq12             = _fjsp_mul_v2r8(iq1,jq2);
+    qq20             = _fjsp_mul_v2r8(iq2,jq0);
+    qq21             = _fjsp_mul_v2r8(iq2,jq1);
+    qq22             = _fjsp_mul_v2r8(iq2,jq2);
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+    rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+
+    sh_vdw_invrcut6  = gmx_fjsp_set1_v2r8(fr->ic->sh_invrc6);
+    rvdw             = gmx_fjsp_set1_v2r8(fr->rvdw);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+        vvdwsum          = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx01             = _fjsp_sub_v2r8(ix0,jx1);
+            dy01             = _fjsp_sub_v2r8(iy0,jy1);
+            dz01             = _fjsp_sub_v2r8(iz0,jz1);
+            dx02             = _fjsp_sub_v2r8(ix0,jx2);
+            dy02             = _fjsp_sub_v2r8(iy0,jy2);
+            dz02             = _fjsp_sub_v2r8(iz0,jz2);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+            rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+            rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+            rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq00,rinv00),crf));
+            felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8(_fjsp_nmsub_v2r8(c12_00,_fjsp_mul_v2r8(sh_vdw_invrcut6,sh_vdw_invrcut6),vvdw12),one_twelfth,
+                                           _fjsp_mul_v2r8(_fjsp_nmsub_v2r8( c6_00,sh_vdw_invrcut6,vvdw6),one_sixth));
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+            vvdw             = _fjsp_and_v2r8(vvdw,cutoff_mask);
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq01,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq01,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq01,rinv01),crf));
+            felec            = _fjsp_mul_v2r8(qq01,_fjsp_msub_v2r8(rinv01,rinvsq01,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq01,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+            
+            fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq02,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq02,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq02,rinv02),crf));
+            felec            = _fjsp_mul_v2r8(qq02,_fjsp_msub_v2r8(rinv02,rinvsq02,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq02,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+            
+            fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq10,rinv10),crf));
+            felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq11,rinv11),crf));
+            felec            = _fjsp_mul_v2r8(qq11,_fjsp_msub_v2r8(rinv11,rinvsq11,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq12,rinv12),crf));
+            felec            = _fjsp_mul_v2r8(qq12,_fjsp_msub_v2r8(rinv12,rinvsq12,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq20,rinv20),crf));
+            felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq21,rinv21),crf));
+            felec            = _fjsp_mul_v2r8(qq21,_fjsp_msub_v2r8(rinv21,rinvsq21,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq22,rinv22),crf));
+            felec            = _fjsp_mul_v2r8(qq22,_fjsp_msub_v2r8(rinv22,rinvsq22,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            }
+
+            gmx_fjsp_decrement_3rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 369 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx01             = _fjsp_sub_v2r8(ix0,jx1);
+            dy01             = _fjsp_sub_v2r8(iy0,jy1);
+            dz01             = _fjsp_sub_v2r8(iz0,jz1);
+            dx02             = _fjsp_sub_v2r8(ix0,jx2);
+            dy02             = _fjsp_sub_v2r8(iy0,jy2);
+            dz02             = _fjsp_sub_v2r8(iz0,jz2);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+            rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+            rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+            rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq00,rinv00),crf));
+            felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8(_fjsp_nmsub_v2r8(c12_00,_fjsp_mul_v2r8(sh_vdw_invrcut6,sh_vdw_invrcut6),vvdw12),one_twelfth,
+                                           _fjsp_mul_v2r8(_fjsp_nmsub_v2r8( c6_00,sh_vdw_invrcut6,vvdw6),one_sixth));
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+            vvdw             = _fjsp_and_v2r8(vvdw,cutoff_mask);
+            vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq01,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq01,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq01,rinv01),crf));
+            felec            = _fjsp_mul_v2r8(qq01,_fjsp_msub_v2r8(rinv01,rinvsq01,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq01,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+            
+            fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq02,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq02,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq02,rinv02),crf));
+            felec            = _fjsp_mul_v2r8(qq02,_fjsp_msub_v2r8(rinv02,rinvsq02,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq02,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+            
+            fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq10,rinv10),crf));
+            felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq11,rinv11),crf));
+            felec            = _fjsp_mul_v2r8(qq11,_fjsp_msub_v2r8(rinv11,rinvsq11,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq12,rinv12),crf));
+            felec            = _fjsp_mul_v2r8(qq12,_fjsp_msub_v2r8(rinv12,rinvsq12,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq20,rinv20),crf));
+            felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq21,rinv21),crf));
+            felec            = _fjsp_mul_v2r8(qq21,_fjsp_msub_v2r8(rinv21,rinvsq21,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq22,rinv22),crf));
+            felec            = _fjsp_mul_v2r8(qq22,_fjsp_msub_v2r8(rinv22,rinvsq22,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            }
+
+            gmx_fjsp_decrement_3rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 369 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 20 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*369);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwLJSh_GeomW3W3_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: ReactionField
+ * VdW interaction:            LennardJones
+ * Geometry:                   Water3-Water3
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecRFCut_VdwLJSh_GeomW3W3_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B;
+    _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B;
+    _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+    _fjsp_v2r8       dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+    krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0);
+    crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf);
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq0              = gmx_fjsp_set1_v2r8(charge[inr+0]);
+    jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+    jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    qq00             = _fjsp_mul_v2r8(iq0,jq0);
+    c6_00            = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    qq01             = _fjsp_mul_v2r8(iq0,jq1);
+    qq02             = _fjsp_mul_v2r8(iq0,jq2);
+    qq10             = _fjsp_mul_v2r8(iq1,jq0);
+    qq11             = _fjsp_mul_v2r8(iq1,jq1);
+    qq12             = _fjsp_mul_v2r8(iq1,jq2);
+    qq20             = _fjsp_mul_v2r8(iq2,jq0);
+    qq21             = _fjsp_mul_v2r8(iq2,jq1);
+    qq22             = _fjsp_mul_v2r8(iq2,jq2);
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+    rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+
+    sh_vdw_invrcut6  = gmx_fjsp_set1_v2r8(fr->ic->sh_invrc6);
+    rvdw             = gmx_fjsp_set1_v2r8(fr->rvdw);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx01             = _fjsp_sub_v2r8(ix0,jx1);
+            dy01             = _fjsp_sub_v2r8(iy0,jy1);
+            dz01             = _fjsp_sub_v2r8(iz0,jz1);
+            dx02             = _fjsp_sub_v2r8(ix0,jx2);
+            dy02             = _fjsp_sub_v2r8(iy0,jy2);
+            dz02             = _fjsp_sub_v2r8(iz0,jz2);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+            rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+            rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+            rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,c6_00),_fjsp_mul_v2r8(rinvsix,rinvsq00));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq01,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq01,_fjsp_msub_v2r8(rinv01,rinvsq01,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq01,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+            
+            fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq02,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq02,_fjsp_msub_v2r8(rinv02,rinvsq02,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq02,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+            
+            fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq11,_fjsp_msub_v2r8(rinv11,rinvsq11,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq12,_fjsp_msub_v2r8(rinv12,rinvsq12,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq21,_fjsp_msub_v2r8(rinv21,rinvsq21,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq22,_fjsp_msub_v2r8(rinv22,rinvsq22,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            }
+
+            gmx_fjsp_decrement_3rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 304 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx01             = _fjsp_sub_v2r8(ix0,jx1);
+            dy01             = _fjsp_sub_v2r8(iy0,jy1);
+            dz01             = _fjsp_sub_v2r8(iz0,jz1);
+            dx02             = _fjsp_sub_v2r8(ix0,jx2);
+            dy02             = _fjsp_sub_v2r8(iy0,jy2);
+            dz02             = _fjsp_sub_v2r8(iz0,jz2);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+            rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+            rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+            rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,c6_00),_fjsp_mul_v2r8(rinvsix,rinvsq00));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq01,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq01,_fjsp_msub_v2r8(rinv01,rinvsq01,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq01,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+            
+            fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq02,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq02,_fjsp_msub_v2r8(rinv02,rinvsq02,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq02,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+            
+            fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq11,_fjsp_msub_v2r8(rinv11,rinvsq11,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq12,_fjsp_msub_v2r8(rinv12,rinvsq12,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq21,_fjsp_msub_v2r8(rinv21,rinvsq21,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq22,_fjsp_msub_v2r8(rinv22,rinvsq22,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            }
+
+            gmx_fjsp_decrement_3rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 304 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 18 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*304);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_sparc64_hpc_ace_double.c b/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..c4597a8
--- /dev/null
@@ -0,0 +1,1133 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: ReactionField
+ * VdW interaction:            LennardJones
+ * Geometry:                   Water4-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+    krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0);
+    crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf);
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+    rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+
+    sh_vdw_invrcut6  = gmx_fjsp_set1_v2r8(fr->ic->sh_invrc6);
+    rvdw             = gmx_fjsp_set1_v2r8(fr->rvdw);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_4rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+        fix3             = _fjsp_setzero_v2r8();
+        fiy3             = _fjsp_setzero_v2r8();
+        fiz3             = _fjsp_setzero_v2r8();
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+        vvdwsum          = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx30             = _fjsp_sub_v2r8(ix3,jx0);
+            dy30             = _fjsp_sub_v2r8(iy3,jy0);
+            dz30             = _fjsp_sub_v2r8(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+
+            rinvsq00         = gmx_fjsp_inv_v2r8(rsq00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8(_fjsp_nmsub_v2r8(c12_00,_fjsp_mul_v2r8(sh_vdw_invrcut6,sh_vdw_invrcut6),vvdw12),one_twelfth,
+                                           _fjsp_mul_v2r8(_fjsp_nmsub_v2r8( c6_00,sh_vdw_invrcut6,vvdw6),one_sixth));
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdw             = _fjsp_and_v2r8(vvdw,cutoff_mask);
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq10,rinv10),crf));
+            felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq20,rinv20),crf));
+            felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq30,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _fjsp_mul_v2r8(iq3,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq30,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq30,rinv30),crf));
+            felec            = _fjsp_mul_v2r8(qq30,_fjsp_msub_v2r8(rinv30,rinvsq30,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq30,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+            
+            fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+
+            }
+
+            gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 164 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx30             = _fjsp_sub_v2r8(ix3,jx0);
+            dy30             = _fjsp_sub_v2r8(iy3,jy0);
+            dz30             = _fjsp_sub_v2r8(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+
+            rinvsq00         = gmx_fjsp_inv_v2r8(rsq00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8(_fjsp_nmsub_v2r8(c12_00,_fjsp_mul_v2r8(sh_vdw_invrcut6,sh_vdw_invrcut6),vvdw12),one_twelfth,
+                                           _fjsp_mul_v2r8(_fjsp_nmsub_v2r8( c6_00,sh_vdw_invrcut6,vvdw6),one_sixth));
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdw             = _fjsp_and_v2r8(vvdw,cutoff_mask);
+            vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq10,rinv10),crf));
+            felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq20,rinv20),crf));
+            felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq30,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _fjsp_mul_v2r8(iq3,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq30,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq30,rinv30),crf));
+            felec            = _fjsp_mul_v2r8(qq30,_fjsp_msub_v2r8(rinv30,rinvsq30,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq30,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+            
+            fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+
+            }
+
+            gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 164 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_4atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 26 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_VF,outeriter*26 + inneriter*164);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: ReactionField
+ * VdW interaction:            LennardJones
+ * Geometry:                   Water4-Particle
+ * Calculate force/pot:        Force
+ *
+ * Force-only kernel. For every i entry of the neighbor list, four i sites
+ * (suffixes 0-3) interact with single j particles. The j particles are
+ * processed two at a time in a 2-wide double precision SIMD register; an odd
+ * trailing j particle is handled by a separate single-j pass after the main
+ * inner loop.
+ *
+ *  - Site 0 interacts through Lennard-Jones only (c6/c12 read from fr->nbfp).
+ *  - Sites 1-3 interact through reaction-field electrostatics only
+ *    (charges from mdatoms->chargeA, prescaled by fr->epsfac).
+ *  - Every pair force is masked with rsq < rcutoff2, where the cutoff is
+ *    taken from fr->rcoulomb.
+ *
+ * No potential energies are evaluated here: velecsum/vvdwsum/ggid are
+ * declared but never used, and kernel_data is not referenced.
+ *
+ * Parameters:
+ *   nlist       - neighbor list; nri, iinr, jindex, jjnr, shift and gid are
+ *                 read (the gid pointer is stored but never dereferenced).
+ *   xx          - coordinates, accessed as a flat real array through xx[0].
+ *   ff          - forces, accessed as a flat real array through ff[0] and
+ *                 handed to the force update helpers.
+ *   fr          - force record supplying the shift vectors, the shift-force
+ *                 array, epsfac, reaction-field constants, LJ parameters
+ *                 and cutoffs.
+ *   mdatoms     - supplies per-atom charges (chargeA) and VdW types (typeA).
+ *   kernel_data - unused in this force-only kernel.
+ *   nrnb        - flop accounting; updated once at the end via inc_nrnb().
+ */
+void
+nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     * Two-letter suffixes such as 10 or 30 name an (i site, j site) pair.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0]; /* flat real view of the coordinate rvec array */
+    f                = ff[0]; /* flat real view of the force rvec array */
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf); /* not used further in this force-only kernel */
+    krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0);
+    crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf); /* not used further in this force-only kernel */
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    /* Setup water-specific parameters.
+     * The charges of sites 1-3 (prescaled by facel) and the VdW parameter
+     * offset of site 0 are read once, from the first i entry of the list,
+     * and reused for every i entry. This presumably relies on all i
+     * molecules in the list being identical - TODO confirm against the
+     * neighbor list construction.
+     */
+    inr              = nlist->iinr[0];
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice.
+     * rcutoff2 is the squared cutoff that every cutoff test and mask below compares rsq against.
+     */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+    rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+
+    sh_vdw_invrcut6  = gmx_fjsp_set1_v2r8(fr->ic->sh_invrc6); /* not used further in this force-only kernel */
+    rvdw             = gmx_fjsp_set1_v2r8(fr->rvdw); /* not used further in this force-only kernel */
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list
+         * (only the offset is computed here: DIM reals per shift index).
+         */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector.
+         * Fills ix0..iz3 for the four i sites; judging by the helper name the
+         * values are broadcast to both SIMD lanes - TODO confirm in
+         * kernelutil_sparc64_hpc_ace_double.h.
+         */
+        gmx_fjsp_load_shift_and_4rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+        fix3             = _fjsp_setzero_v2r8();
+        fiy3             = _fjsp_setzero_v2r8();
+        fiz3             = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop.
+         * Two j particles (A and B) are handled per iteration. The loop only
+         * runs while at least two j entries remain, so an odd last entry is
+         * left for the single-j pass after the loop.
+         */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx30             = _fjsp_sub_v2r8(ix3,jx0);
+            dy30             = _fjsp_sub_v2r8(iy3,jy0);
+            dz30             = _fjsp_sub_v2r8(iz3,jz0);
+
+            /* Calculate squared distance and things based on it.
+             * Pair 00 (LJ only) gets rinvsq straight from rsq; pairs 10/20/30
+             * (reaction field) get rinv first and rinvsq as rinv*rinv.
+             */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+
+            rinvsq00         = gmx_fjsp_inv_v2r8(rsq00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            fjx0             = _fjsp_setzero_v2r8(); /* j force accumulators are reset for every j pair */
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2)) /* per the helper name: enter if any lane is inside the cutoff */
+            {
+
+            /* Compute parameters for interactions between i and j atoms.
+             * The index is vdwioffset0 (i type row) plus 2*type(j), which
+             * suggests consecutive (c6,c12) pairs in vdwparam - TODO confirm.
+             */
+            gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION
+             * Force only: fvdw = (c12*rinvsix - c6) * rinvsix * rinvsq,
+             * assuming _fjsp_msub_v2r8(a,b,c) evaluates a*b-c - TODO confirm
+             * against the HPC-ACE intrinsic reference.
+             */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,c6_00),_fjsp_mul_v2r8(rinvsix,rinvsq00));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            fscal            = fvdw;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask); /* per-lane cutoff: the block is entered if only one lane qualifies */
+
+            /* Update vectorial force.
+             * The same fscal*d term is accumulated for the i site and for the
+             * j particle; the j sum is handed to the decrement helper once
+             * all four site blocks are done.
+             */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS
+             * Force only: felec = qq*(rinv*rinvsq - krf2), with krf2 = 2*k_rf,
+             * assuming _fjsp_msub_v2r8(a,b,c) evaluates a*b-c - TODO confirm.
+             * The same form is used for sites 2 and 3 below.
+             */
+            felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq30,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _fjsp_mul_v2r8(iq3,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq30,_fjsp_msub_v2r8(rinv30,rinvsq30,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq30,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+            
+            fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+
+            }
+
+            gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 135 flops */
+        }
+
+        if(jidx<j_index_end) /* one j particle is left over when the list length is odd */
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx30             = _fjsp_sub_v2r8(ix3,jx0);
+            dy30             = _fjsp_sub_v2r8(iy3,jy0);
+            dz30             = _fjsp_sub_v2r8(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+
+            rinvsq00         = gmx_fjsp_inv_v2r8(rsq00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+
+            /* Load parameters for j particles.
+             * Only jnrA exists here; the charge load starts from a zeroed
+             * register, presumably leaving the second lane at zero - TODO
+             * confirm the _fjsp_loadl_v2r8 semantics.
+             */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,c6_00),_fjsp_mul_v2r8(rinvsix,rinvsq00));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            fscal            = fvdw;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8()); /* presumably keeps the low lane and zeroes the other so the unused lane adds no force - TODO confirm; repeated in the blocks below */
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq30,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _fjsp_mul_v2r8(iq3,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq30,_fjsp_msub_v2r8(rinv30,rinvsq30,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq30,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+            
+            fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+
+            }
+
+            gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 135 flops */
+        }
+
+        /* End of innermost loop.
+         * The accumulated forces of the four i sites are handed to the update
+         * helper together with the force and shift-force destinations.
+         */
+
+        gmx_fjsp_update_iforce_4atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations
+         * (counted per j particle, not per two-wide SIMD iteration).
+         */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 24 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops
+     * (24 and 135 are the per-iteration counts quoted in the loop comments above).
+     */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_F,outeriter*24 + inneriter*135);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRFCut_VdwLJSh_GeomW4W4_sparc64_hpc_ace_double.c b/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRFCut_VdwLJSh_GeomW4W4_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..4ebe12c
--- /dev/null
@@ -0,0 +1,2043 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwLJSh_GeomW4W4_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: ReactionField
+ * VdW interaction:            LennardJones
+ * Geometry:                   Water4-Water4
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecRFCut_VdwLJSh_GeomW4W4_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B;
+    _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B;
+    _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    int              vdwjidx3A,vdwjidx3B;
+    _fjsp_v2r8       jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    _fjsp_v2r8       dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+    _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    _fjsp_v2r8       dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+    _fjsp_v2r8       dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+    _fjsp_v2r8       dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+    _fjsp_v2r8       dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+    krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0);
+    crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf);
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+    jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+    jq3              = gmx_fjsp_set1_v2r8(charge[inr+3]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    c6_00            = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    qq11             = _fjsp_mul_v2r8(iq1,jq1);
+    qq12             = _fjsp_mul_v2r8(iq1,jq2);
+    qq13             = _fjsp_mul_v2r8(iq1,jq3);
+    qq21             = _fjsp_mul_v2r8(iq2,jq1);
+    qq22             = _fjsp_mul_v2r8(iq2,jq2);
+    qq23             = _fjsp_mul_v2r8(iq2,jq3);
+    qq31             = _fjsp_mul_v2r8(iq3,jq1);
+    qq32             = _fjsp_mul_v2r8(iq3,jq2);
+    qq33             = _fjsp_mul_v2r8(iq3,jq3);
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+    rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+
+    sh_vdw_invrcut6  = gmx_fjsp_set1_v2r8(fr->ic->sh_invrc6);
+    rvdw             = gmx_fjsp_set1_v2r8(fr->rvdw);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_4rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+        fix3             = _fjsp_setzero_v2r8();
+        fiy3             = _fjsp_setzero_v2r8();
+        fiz3             = _fjsp_setzero_v2r8();
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+        vvdwsum          = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_4rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx13             = _fjsp_sub_v2r8(ix1,jx3);
+            dy13             = _fjsp_sub_v2r8(iy1,jy3);
+            dz13             = _fjsp_sub_v2r8(iz1,jz3);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+            dx23             = _fjsp_sub_v2r8(ix2,jx3);
+            dy23             = _fjsp_sub_v2r8(iy2,jy3);
+            dz23             = _fjsp_sub_v2r8(iz2,jz3);
+            dx31             = _fjsp_sub_v2r8(ix3,jx1);
+            dy31             = _fjsp_sub_v2r8(iy3,jy1);
+            dz31             = _fjsp_sub_v2r8(iz3,jz1);
+            dx32             = _fjsp_sub_v2r8(ix3,jx2);
+            dy32             = _fjsp_sub_v2r8(iy3,jy2);
+            dz32             = _fjsp_sub_v2r8(iz3,jz2);
+            dx33             = _fjsp_sub_v2r8(ix3,jx3);
+            dy33             = _fjsp_sub_v2r8(iy3,jy3);
+            dz33             = _fjsp_sub_v2r8(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+            rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+            rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+            rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+            rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+            rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+            rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+            rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+            rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+
+            rinvsq00         = gmx_fjsp_inv_v2r8(rsq00);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+            rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+            rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+            rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+            rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+            fjx3             = _fjsp_setzero_v2r8();
+            fjy3             = _fjsp_setzero_v2r8();
+            fjz3             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8(_fjsp_nmsub_v2r8(c12_00,_fjsp_mul_v2r8(sh_vdw_invrcut6,sh_vdw_invrcut6),vvdw12),one_twelfth,
+                                           _fjsp_mul_v2r8(_fjsp_nmsub_v2r8( c6_00,sh_vdw_invrcut6,vvdw6),one_sixth));
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdw             = _fjsp_and_v2r8(vvdw,cutoff_mask);
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq11,rinv11),crf));
+            felec            = _fjsp_mul_v2r8(qq11,_fjsp_msub_v2r8(rinv11,rinvsq11,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq12,rinv12),crf));
+            felec            = _fjsp_mul_v2r8(qq12,_fjsp_msub_v2r8(rinv12,rinvsq12,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq13,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq13,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq13,rinv13),crf));
+            felec            = _fjsp_mul_v2r8(qq13,_fjsp_msub_v2r8(rinv13,rinvsq13,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq13,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+            
+            fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq21,rinv21),crf));
+            felec            = _fjsp_mul_v2r8(qq21,_fjsp_msub_v2r8(rinv21,rinvsq21,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq22,rinv22),crf));
+            felec            = _fjsp_mul_v2r8(qq22,_fjsp_msub_v2r8(rinv22,rinvsq22,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq23,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq23,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq23,rinv23),crf));
+            felec            = _fjsp_mul_v2r8(qq23,_fjsp_msub_v2r8(rinv23,rinvsq23,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq23,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+            
+            fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq31,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq31,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq31,rinv31),crf));
+            felec            = _fjsp_mul_v2r8(qq31,_fjsp_msub_v2r8(rinv31,rinvsq31,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq31,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+            
+            fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq32,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq32,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq32,rinv32),crf));
+            felec            = _fjsp_mul_v2r8(qq32,_fjsp_msub_v2r8(rinv32,rinvsq32,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq32,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+            
+            fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq33,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq33,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq33,rinv33),crf));
+            felec            = _fjsp_mul_v2r8(qq33,_fjsp_msub_v2r8(rinv33,rinvsq33,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq33,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+            
+            fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+
+            }
+
+            gmx_fjsp_decrement_4rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 398 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_4rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx13             = _fjsp_sub_v2r8(ix1,jx3);
+            dy13             = _fjsp_sub_v2r8(iy1,jy3);
+            dz13             = _fjsp_sub_v2r8(iz1,jz3);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+            dx23             = _fjsp_sub_v2r8(ix2,jx3);
+            dy23             = _fjsp_sub_v2r8(iy2,jy3);
+            dz23             = _fjsp_sub_v2r8(iz2,jz3);
+            dx31             = _fjsp_sub_v2r8(ix3,jx1);
+            dy31             = _fjsp_sub_v2r8(iy3,jy1);
+            dz31             = _fjsp_sub_v2r8(iz3,jz1);
+            dx32             = _fjsp_sub_v2r8(ix3,jx2);
+            dy32             = _fjsp_sub_v2r8(iy3,jy2);
+            dz32             = _fjsp_sub_v2r8(iz3,jz2);
+            dx33             = _fjsp_sub_v2r8(ix3,jx3);
+            dy33             = _fjsp_sub_v2r8(iy3,jy3);
+            dz33             = _fjsp_sub_v2r8(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+            rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+            rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+            rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+            rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+            rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+            rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+            rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+            rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+
+            rinvsq00         = gmx_fjsp_inv_v2r8(rsq00);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+            rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+            rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+            rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+            rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+            fjx3             = _fjsp_setzero_v2r8();
+            fjy3             = _fjsp_setzero_v2r8();
+            fjz3             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8(_fjsp_nmsub_v2r8(c12_00,_fjsp_mul_v2r8(sh_vdw_invrcut6,sh_vdw_invrcut6),vvdw12),one_twelfth,
+                                           _fjsp_mul_v2r8(_fjsp_nmsub_v2r8( c6_00,sh_vdw_invrcut6,vvdw6),one_sixth));
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdw             = _fjsp_and_v2r8(vvdw,cutoff_mask);
+            vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq11,rinv11),crf));
+            felec            = _fjsp_mul_v2r8(qq11,_fjsp_msub_v2r8(rinv11,rinvsq11,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq12,rinv12),crf));
+            felec            = _fjsp_mul_v2r8(qq12,_fjsp_msub_v2r8(rinv12,rinvsq12,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq13,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq13,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq13,rinv13),crf));
+            felec            = _fjsp_mul_v2r8(qq13,_fjsp_msub_v2r8(rinv13,rinvsq13,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq13,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+            
+            fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq21,rinv21),crf));
+            felec            = _fjsp_mul_v2r8(qq21,_fjsp_msub_v2r8(rinv21,rinvsq21,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq22,rinv22),crf));
+            felec            = _fjsp_mul_v2r8(qq22,_fjsp_msub_v2r8(rinv22,rinvsq22,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq23,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq23,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq23,rinv23),crf));
+            felec            = _fjsp_mul_v2r8(qq23,_fjsp_msub_v2r8(rinv23,rinvsq23,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq23,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+            
+            fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq31,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq31,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq31,rinv31),crf));
+            felec            = _fjsp_mul_v2r8(qq31,_fjsp_msub_v2r8(rinv31,rinvsq31,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq31,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+            
+            fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq32,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq32,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq32,rinv32),crf));
+            felec            = _fjsp_mul_v2r8(qq32,_fjsp_msub_v2r8(rinv32,rinvsq32,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq32,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+            
+            fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq33,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq33,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq33,rinv33),crf));
+            felec            = _fjsp_mul_v2r8(qq33,_fjsp_msub_v2r8(rinv33,rinvsq33,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq33,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+            
+            fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+
+            }
+
+            gmx_fjsp_decrement_4rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 398 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_4atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 26 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*398);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwLJSh_GeomW4W4_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: ReactionField
+ * VdW interaction:            LennardJones
+ * Geometry:                   Water4-Water4
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecRFCut_VdwLJSh_GeomW4W4_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B;
+    _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B;
+    _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    int              vdwjidx3A,vdwjidx3B;
+    _fjsp_v2r8       jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    _fjsp_v2r8       dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+    _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    _fjsp_v2r8       dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+    _fjsp_v2r8       dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+    _fjsp_v2r8       dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+    _fjsp_v2r8       dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+    krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0);
+    crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf);
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+    jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+    jq3              = gmx_fjsp_set1_v2r8(charge[inr+3]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    c6_00            = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    qq11             = _fjsp_mul_v2r8(iq1,jq1);
+    qq12             = _fjsp_mul_v2r8(iq1,jq2);
+    qq13             = _fjsp_mul_v2r8(iq1,jq3);
+    qq21             = _fjsp_mul_v2r8(iq2,jq1);
+    qq22             = _fjsp_mul_v2r8(iq2,jq2);
+    qq23             = _fjsp_mul_v2r8(iq2,jq3);
+    qq31             = _fjsp_mul_v2r8(iq3,jq1);
+    qq32             = _fjsp_mul_v2r8(iq3,jq2);
+    qq33             = _fjsp_mul_v2r8(iq3,jq3);
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+    rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+
+    sh_vdw_invrcut6  = gmx_fjsp_set1_v2r8(fr->ic->sh_invrc6);
+    rvdw             = gmx_fjsp_set1_v2r8(fr->rvdw);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_4rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+        fix3             = _fjsp_setzero_v2r8();
+        fiy3             = _fjsp_setzero_v2r8();
+        fiz3             = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_4rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx13             = _fjsp_sub_v2r8(ix1,jx3);
+            dy13             = _fjsp_sub_v2r8(iy1,jy3);
+            dz13             = _fjsp_sub_v2r8(iz1,jz3);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+            dx23             = _fjsp_sub_v2r8(ix2,jx3);
+            dy23             = _fjsp_sub_v2r8(iy2,jy3);
+            dz23             = _fjsp_sub_v2r8(iz2,jz3);
+            dx31             = _fjsp_sub_v2r8(ix3,jx1);
+            dy31             = _fjsp_sub_v2r8(iy3,jy1);
+            dz31             = _fjsp_sub_v2r8(iz3,jz1);
+            dx32             = _fjsp_sub_v2r8(ix3,jx2);
+            dy32             = _fjsp_sub_v2r8(iy3,jy2);
+            dz32             = _fjsp_sub_v2r8(iz3,jz2);
+            dx33             = _fjsp_sub_v2r8(ix3,jx3);
+            dy33             = _fjsp_sub_v2r8(iy3,jy3);
+            dz33             = _fjsp_sub_v2r8(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+            rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+            rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+            rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+            rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+            rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+            rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+            rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+            rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+
+            rinvsq00         = gmx_fjsp_inv_v2r8(rsq00);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+            rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+            rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+            rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+            rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+            fjx3             = _fjsp_setzero_v2r8();
+            fjy3             = _fjsp_setzero_v2r8();
+            fjz3             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,c6_00),_fjsp_mul_v2r8(rinvsix,rinvsq00));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            fscal            = fvdw;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq11,_fjsp_msub_v2r8(rinv11,rinvsq11,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq12,_fjsp_msub_v2r8(rinv12,rinvsq12,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq13,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq13,_fjsp_msub_v2r8(rinv13,rinvsq13,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq13,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+            
+            fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq21,_fjsp_msub_v2r8(rinv21,rinvsq21,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq22,_fjsp_msub_v2r8(rinv22,rinvsq22,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq23,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq23,_fjsp_msub_v2r8(rinv23,rinvsq23,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq23,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+            
+            fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq31,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq31,_fjsp_msub_v2r8(rinv31,rinvsq31,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq31,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+            
+            fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq32,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq32,_fjsp_msub_v2r8(rinv32,rinvsq32,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq32,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+            
+            fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq33,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq33,_fjsp_msub_v2r8(rinv33,rinvsq33,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq33,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+            
+            fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+
+            }
+
+            gmx_fjsp_decrement_4rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 333 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_4rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx13             = _fjsp_sub_v2r8(ix1,jx3);
+            dy13             = _fjsp_sub_v2r8(iy1,jy3);
+            dz13             = _fjsp_sub_v2r8(iz1,jz3);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+            dx23             = _fjsp_sub_v2r8(ix2,jx3);
+            dy23             = _fjsp_sub_v2r8(iy2,jy3);
+            dz23             = _fjsp_sub_v2r8(iz2,jz3);
+            dx31             = _fjsp_sub_v2r8(ix3,jx1);
+            dy31             = _fjsp_sub_v2r8(iy3,jy1);
+            dz31             = _fjsp_sub_v2r8(iz3,jz1);
+            dx32             = _fjsp_sub_v2r8(ix3,jx2);
+            dy32             = _fjsp_sub_v2r8(iy3,jy2);
+            dz32             = _fjsp_sub_v2r8(iz3,jz2);
+            dx33             = _fjsp_sub_v2r8(ix3,jx3);
+            dy33             = _fjsp_sub_v2r8(iy3,jy3);
+            dz33             = _fjsp_sub_v2r8(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+            rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+            rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+            rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+            rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+            rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+            rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+            rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+            rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+
+            rinvsq00         = gmx_fjsp_inv_v2r8(rsq00);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+            rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+            rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+            rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+            rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+            fjx3             = _fjsp_setzero_v2r8();
+            fjy3             = _fjsp_setzero_v2r8();
+            fjz3             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,c6_00),_fjsp_mul_v2r8(rinvsix,rinvsq00));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            fscal            = fvdw;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq11,_fjsp_msub_v2r8(rinv11,rinvsq11,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq12,_fjsp_msub_v2r8(rinv12,rinvsq12,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq13,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq13,_fjsp_msub_v2r8(rinv13,rinvsq13,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq13,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+            
+            fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq21,_fjsp_msub_v2r8(rinv21,rinvsq21,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq22,_fjsp_msub_v2r8(rinv22,rinvsq22,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq23,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq23,_fjsp_msub_v2r8(rinv23,rinvsq23,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq23,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+            
+            fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq31,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq31,_fjsp_msub_v2r8(rinv31,rinvsq31,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq31,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+            
+            fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq32,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq32,_fjsp_msub_v2r8(rinv32,rinvsq32,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq32,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+            
+            fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq33,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq33,_fjsp_msub_v2r8(rinv33,rinvsq33,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq33,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+            
+            fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+
+            }
+
+            gmx_fjsp_decrement_4rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 333 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_4atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 24 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*333);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRFCut_VdwLJSw_GeomP1P1_sparc64_hpc_ace_double.c b/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRFCut_VdwLJSw_GeomP1P1_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..dd908d3
--- /dev/null
@@ -0,0 +1,683 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwLJSw_GeomP1P1_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: ReactionField (contributions masked out where rsq >= rcutoff^2)
+ * VdW interaction:            LennardJones (multiplied by a switch function of d=max(r-rswitch,0))
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecRFCut_VdwLJSw_GeomP1P1_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters (this particle-particle kernel only uses suffix 0).
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. the two different
+     * jnr indices (jnrA,jnrB) whose data is processed together, one per position of the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
+    real             rswitch_scalar,d_scalar;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+    krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0);
+    crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf);
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+    rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+
+    rswitch_scalar   = fr->rvdw_switch;
+    rswitch          = gmx_fjsp_set1_v2r8(rswitch_scalar);
+    /* Setup switch parameters: swV3..swV5 scale the d^3..d^5 terms of sw, swF2..swF4 the d^2..d^4 terms of dsw (d_scalar is the switching range) */
+    d_scalar         = rcutoff_scalar-rswitch_scalar;
+    d                = gmx_fjsp_set1_v2r8(d_scalar);
+    swV3             = gmx_fjsp_set1_v2r8(-10.0/(d_scalar*d_scalar*d_scalar));
+    swV4             = gmx_fjsp_set1_v2r8( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+    swV5             = gmx_fjsp_set1_v2r8( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+    swF2             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar));
+    swF3             = gmx_fjsp_set1_v2r8( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+    swF4             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+
+    /* Give the j indices and offsets defined values up front to avoid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_1rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+
+        /* Load parameters for i particles */
+        iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_load1_v2r8(charge+inr+0));
+        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+        vvdwsum          = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop: j neighbors are processed in pairs (A,B); an odd last one is handled after the loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq00,rinv00),crf));
+            felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            d                = _fjsp_sub_v2r8(r00,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            fvdw             = _fjsp_msub_v2r8( fvdw,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(vvdw,dsw)) );
+            vvdw             = _fjsp_mul_v2r8(vvdw,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+            vvdw             = _fjsp_and_v2r8(vvdw,cutoff_mask);
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fscal,dx00,dy00,dz00);
+
+            }
+
+            /* Inner loop uses 73 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* Remainder (odd number of j neighbors): load coordinates of the single remaining j atom */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+
+            /* Load parameters for the single j particle (charge goes into a zeroed register via loadl) */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq00,rinv00),crf));
+            felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            d                = _fjsp_sub_v2r8(r00,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            fvdw             = _fjsp_msub_v2r8( fvdw,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(vvdw,dsw)) );
+            vvdw             = _fjsp_mul_v2r8(vvdw,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom; the unpacklo with zero presumably discards the unused upper SIMD element */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+            vvdw             = _fjsp_and_v2r8(vvdw,cutoff_mask);
+            vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fscal,dx00,dy00,dz00);
+
+            }
+
+            /* Inner loop uses 73 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_1atom_swizzle_v2r8(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies: add the sums to the energygrp_elec/energygrp_vdw entries at index ggid */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 9 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops in the nrnb counters (9 per outer and 73 per inner iteration) */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_VF,outeriter*9 + inneriter*73);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwLJSw_GeomP1P1_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: ReactionField (contributions masked out where rsq >= rcutoff^2)
+ * VdW interaction:            LennardJones (multiplied by a switch function of d=max(r-rswitch,0))
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecRFCut_VdwLJSw_GeomP1P1_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters (this particle-particle kernel only uses suffix 0).
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. the two different
+     * jnr indices (jnrA,jnrB) whose data is processed together, one per position of the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
+    real             rswitch_scalar,d_scalar;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+    krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0);
+    crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf);
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+    rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+
+    rswitch_scalar   = fr->rvdw_switch;
+    rswitch          = gmx_fjsp_set1_v2r8(rswitch_scalar);
+    /* Setup switch parameters: swV3..swV5 scale the d^3..d^5 terms of sw, swF2..swF4 the d^2..d^4 terms of dsw (d_scalar is the switching range) */
+    d_scalar         = rcutoff_scalar-rswitch_scalar;
+    d                = gmx_fjsp_set1_v2r8(d_scalar);
+    swV3             = gmx_fjsp_set1_v2r8(-10.0/(d_scalar*d_scalar*d_scalar));
+    swV4             = gmx_fjsp_set1_v2r8( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+    swV5             = gmx_fjsp_set1_v2r8( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+    swF2             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar));
+    swF3             = gmx_fjsp_set1_v2r8( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+    swF4             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+
+    /* Give the j indices and offsets defined values up front to avoid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_1rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+
+        /* Load parameters for i particles */
+        iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_load1_v2r8(charge+inr+0));
+        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+        /* Start inner kernel loop: j neighbors are processed in pairs (A,B); an odd last one is handled after the loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            d                = _fjsp_sub_v2r8(r00,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function (the unswitched potential vvdw is still needed for the force) */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            fvdw             = _fjsp_msub_v2r8( fvdw,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(vvdw,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fscal,dx00,dy00,dz00);
+
+            }
+
+            /* Inner loop uses 64 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* Remainder (odd number of j neighbors): load coordinates of the single remaining j atom */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+
+            /* Load parameters for the single j particle (charge goes into a zeroed register via loadl) */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            d                = _fjsp_sub_v2r8(r00,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function (the unswitched potential vvdw is still needed for the force) */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            fvdw             = _fjsp_msub_v2r8( fvdw,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(vvdw,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force (the unpacklo with zero above presumably discards the unused upper SIMD element) */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fscal,dx00,dy00,dz00);
+
+            }
+
+            /* Inner loop uses 64 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_1atom_swizzle_v2r8(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 7 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops in the nrnb counters (7 per outer and 64 per inner iteration) */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_F,outeriter*7 + inneriter*64);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRFCut_VdwLJSw_GeomW3P1_sparc64_hpc_ace_double.c b/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRFCut_VdwLJSw_GeomW3P1_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..36c26d1
--- /dev/null
@@ -0,0 +1,1065 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwLJSw_GeomW3P1_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: ReactionField (uses fr->ic->k_rf and c_rf; results masked with rsq < rcutoff2)
+ * VdW interaction:            LennardJones, multiplied by a switch polynomial starting at fr->rvdw_switch
+ * Geometry:                   Water3-Particle (i atoms inr+0..2 against one j atom; LJ only on the 0-0 pair)
+ * Calculate force/pot:        PotentialAndForce (forces via ff and fr->fshift, energies via kernel_data)
+ */
+void
+nb_kernel_ElecRFCut_VdwLJSw_GeomW3P1_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. the two different
+     * jnr indices whose data are put in the two positions of the _fjsp_v2r8 SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
+    real             rswitch_scalar,d_scalar;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+    /* Unpack inputs: flat real views of xx/ff, neighbor-list arrays from nlist, parameters from fr and mdatoms */
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+    krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0);
+    crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf);
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    /* Setup water-specific parameters once from iinr[0] (reused for every i water). NOTE(review): presumably requires nri > 0 - confirm */
+    inr              = nlist->iinr[0];
+    iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+    rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+
+    rswitch_scalar   = fr->rvdw_switch;
+    rswitch          = gmx_fjsp_set1_v2r8(rswitch_scalar);
+    /* Setup switch parameters: coefficients of sw (swV*) and dsw (swF*) for width d_scalar; the d set here is recomputed from r00 before every use */
+    d_scalar         = rcutoff_scalar-rswitch_scalar;
+    d                = gmx_fjsp_set1_v2r8(d_scalar);
+    swV3             = gmx_fjsp_set1_v2r8(-10.0/(d_scalar*d_scalar*d_scalar));
+    swV4             = gmx_fjsp_set1_v2r8( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+    swV5             = gmx_fjsp_set1_v2r8( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+    swF2             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar));
+    swF3             = gmx_fjsp_set1_v2r8( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+    swF4             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+
+    /* Give the j indices/offsets defined values up front to avoid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+        vvdwsum          = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop: two j atoms (A,B) per iteration; an odd leftover j is handled after the loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+            /* i atom 0 - j atom 0: reaction-field electrostatics plus switched Lennard-Jones */
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq00,rinv00),crf));
+            felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+            /* d = max(r00-rswitch,0) is the distance into the switch region; with d = 0 the expressions below give sw = 1, dsw = 0 */
+            d                = _fjsp_sub_v2r8(r00,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            fvdw             = _fjsp_msub_v2r8( fvdw,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(vvdw,dsw)) );
+            vvdw             = _fjsp_mul_v2r8(vvdw,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+            vvdw             = _fjsp_and_v2r8(vvdw,cutoff_mask);
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+            {
+            /* i atom 1 - j atom 0: reaction-field electrostatics only */
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq10,rinv10),crf));
+            felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+            {
+            /* i atom 2 - j atom 0: reaction-field electrostatics only */
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq20,rinv20),crf));
+            felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+
+            gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 154 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+            /* Odd leftover j atom: only index A is loaded; results go through unpacklo with zero, presumably to clear the unused upper element */
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+            /* i atom 0 - j atom 0: reaction-field electrostatics plus switched Lennard-Jones */
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq00,rinv00),crf));
+            felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+            /* d = max(r00-rswitch,0) is the distance into the switch region; with d = 0 the expressions below give sw = 1, dsw = 0 */
+            d                = _fjsp_sub_v2r8(r00,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            fvdw             = _fjsp_msub_v2r8( fvdw,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(vvdw,dsw)) );
+            vvdw             = _fjsp_mul_v2r8(vvdw,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+            vvdw             = _fjsp_and_v2r8(vvdw,cutoff_mask);
+            vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+            {
+            /* i atom 1 - j atom 0: reaction-field electrostatics only */
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq10,rinv10),crf));
+            felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+            {
+            /* i atom 2 - j atom 0: reaction-field electrostatics only */
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq20,rinv20),crf));
+            felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+
+            gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 154 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies at the entry selected by gid[iidx] */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 20 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_VF,outeriter*20 + inneriter*154);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwLJSw_GeomW3P1_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: ReactionField
+ * VdW interaction:            LennardJones
+ * Geometry:                   Water3-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecRFCut_VdwLJSw_GeomW3P1_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
+    real             rswitch_scalar,d_scalar;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+    krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0);
+    crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf);
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+    rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+
+    rswitch_scalar   = fr->rvdw_switch;
+    rswitch          = gmx_fjsp_set1_v2r8(rswitch_scalar);
+    /* Setup switch parameters */
+    d_scalar         = rcutoff_scalar-rswitch_scalar;
+    d                = gmx_fjsp_set1_v2r8(d_scalar);
+    swV3             = gmx_fjsp_set1_v2r8(-10.0/(d_scalar*d_scalar*d_scalar));
+    swV4             = gmx_fjsp_set1_v2r8( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+    swV5             = gmx_fjsp_set1_v2r8( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+    swF2             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar));
+    swF3             = gmx_fjsp_set1_v2r8( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+    swF4             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            d                = _fjsp_sub_v2r8(r00,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            fvdw             = _fjsp_msub_v2r8( fvdw,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(vvdw,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+
+            gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 133 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            d                = _fjsp_sub_v2r8(r00,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            fvdw             = _fjsp_msub_v2r8( fvdw,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(vvdw,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+
+            gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 133 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 18 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_F,outeriter*18 + inneriter*133);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRFCut_VdwLJSw_GeomW3W3_sparc64_hpc_ace_double.c b/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRFCut_VdwLJSw_GeomW3W3_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..9a66a46
--- /dev/null
@@ -0,0 +1,1963 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwLJSw_GeomW3W3_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: ReactionField
+ * VdW interaction:            LennardJones
+ * Geometry:                   Water3-Water3
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecRFCut_VdwLJSw_GeomW3W3_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B;
+    _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B;
+    _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+    _fjsp_v2r8       dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
+    real             rswitch_scalar,d_scalar;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+    krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0);
+    crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf);
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq0              = gmx_fjsp_set1_v2r8(charge[inr+0]);
+    jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+    jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    qq00             = _fjsp_mul_v2r8(iq0,jq0);
+    c6_00            = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    qq01             = _fjsp_mul_v2r8(iq0,jq1);
+    qq02             = _fjsp_mul_v2r8(iq0,jq2);
+    qq10             = _fjsp_mul_v2r8(iq1,jq0);
+    qq11             = _fjsp_mul_v2r8(iq1,jq1);
+    qq12             = _fjsp_mul_v2r8(iq1,jq2);
+    qq20             = _fjsp_mul_v2r8(iq2,jq0);
+    qq21             = _fjsp_mul_v2r8(iq2,jq1);
+    qq22             = _fjsp_mul_v2r8(iq2,jq2);
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+    rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+
+    rswitch_scalar   = fr->rvdw_switch;
+    rswitch          = gmx_fjsp_set1_v2r8(rswitch_scalar);
+    /* Setup switch parameters */
+    d_scalar         = rcutoff_scalar-rswitch_scalar;
+    d                = gmx_fjsp_set1_v2r8(d_scalar);
+    swV3             = gmx_fjsp_set1_v2r8(-10.0/(d_scalar*d_scalar*d_scalar));
+    swV4             = gmx_fjsp_set1_v2r8( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+    swV5             = gmx_fjsp_set1_v2r8( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+    swF2             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar));
+    swF3             = gmx_fjsp_set1_v2r8( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+    swF4             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+        vvdwsum          = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx01             = _fjsp_sub_v2r8(ix0,jx1);
+            dy01             = _fjsp_sub_v2r8(iy0,jy1);
+            dz01             = _fjsp_sub_v2r8(iz0,jz1);
+            dx02             = _fjsp_sub_v2r8(ix0,jx2);
+            dy02             = _fjsp_sub_v2r8(iy0,jy2);
+            dz02             = _fjsp_sub_v2r8(iz0,jz2);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+            rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+            rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+            rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq00,rinv00),crf));
+            felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            d                = _fjsp_sub_v2r8(r00,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            fvdw             = _fjsp_msub_v2r8( fvdw,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(vvdw,dsw)) );
+            vvdw             = _fjsp_mul_v2r8(vvdw,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+            vvdw             = _fjsp_and_v2r8(vvdw,cutoff_mask);
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq01,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq01,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq01,rinv01),crf));
+            felec            = _fjsp_mul_v2r8(qq01,_fjsp_msub_v2r8(rinv01,rinvsq01,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq01,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+            
+            fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq02,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq02,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq02,rinv02),crf));
+            felec            = _fjsp_mul_v2r8(qq02,_fjsp_msub_v2r8(rinv02,rinvsq02,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq02,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+            
+            fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq10,rinv10),crf));
+            felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq11,rinv11),crf));
+            felec            = _fjsp_mul_v2r8(qq11,_fjsp_msub_v2r8(rinv11,rinvsq11,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq12,rinv12),crf));
+            felec            = _fjsp_mul_v2r8(qq12,_fjsp_msub_v2r8(rinv12,rinvsq12,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq20,rinv20),crf));
+            felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq21,rinv21),crf));
+            felec            = _fjsp_mul_v2r8(qq21,_fjsp_msub_v2r8(rinv21,rinvsq21,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq22,rinv22),crf));
+            felec            = _fjsp_mul_v2r8(qq22,_fjsp_msub_v2r8(rinv22,rinvsq22,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            }
+
+            gmx_fjsp_decrement_3rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 385 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx01             = _fjsp_sub_v2r8(ix0,jx1);
+            dy01             = _fjsp_sub_v2r8(iy0,jy1);
+            dz01             = _fjsp_sub_v2r8(iz0,jz1);
+            dx02             = _fjsp_sub_v2r8(ix0,jx2);
+            dy02             = _fjsp_sub_v2r8(iy0,jy2);
+            dz02             = _fjsp_sub_v2r8(iz0,jz2);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+            rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+            rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+            rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq00,rinv00),crf));
+            felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            d                = _fjsp_sub_v2r8(r00,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            fvdw             = _fjsp_msub_v2r8( fvdw,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(vvdw,dsw)) );
+            vvdw             = _fjsp_mul_v2r8(vvdw,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+            vvdw             = _fjsp_and_v2r8(vvdw,cutoff_mask);
+            vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq01,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq01,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq01,rinv01),crf));
+            felec            = _fjsp_mul_v2r8(qq01,_fjsp_msub_v2r8(rinv01,rinvsq01,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq01,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+            
+            fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq02,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq02,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq02,rinv02),crf));
+            felec            = _fjsp_mul_v2r8(qq02,_fjsp_msub_v2r8(rinv02,rinvsq02,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq02,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+            
+            fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq10,rinv10),crf));
+            felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq11,rinv11),crf));
+            felec            = _fjsp_mul_v2r8(qq11,_fjsp_msub_v2r8(rinv11,rinvsq11,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq12,rinv12),crf));
+            felec            = _fjsp_mul_v2r8(qq12,_fjsp_msub_v2r8(rinv12,rinvsq12,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq20,rinv20),crf));
+            felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq21,rinv21),crf));
+            felec            = _fjsp_mul_v2r8(qq21,_fjsp_msub_v2r8(rinv21,rinvsq21,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq22,rinv22),crf));
+            felec            = _fjsp_mul_v2r8(qq22,_fjsp_msub_v2r8(rinv22,rinvsq22,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            }
+
+            gmx_fjsp_decrement_3rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 385 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 20 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*385);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwLJSw_GeomW3W3_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: ReactionField
+ * VdW interaction:            LennardJones
+ * Geometry:                   Water3-Water3
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecRFCut_VdwLJSw_GeomW3W3_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B;
+    _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B;
+    _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+    _fjsp_v2r8       dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
+    real             rswitch_scalar,d_scalar;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+    krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0);
+    crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf);
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq0              = gmx_fjsp_set1_v2r8(charge[inr+0]);
+    jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+    jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    qq00             = _fjsp_mul_v2r8(iq0,jq0);
+    c6_00            = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    qq01             = _fjsp_mul_v2r8(iq0,jq1);
+    qq02             = _fjsp_mul_v2r8(iq0,jq2);
+    qq10             = _fjsp_mul_v2r8(iq1,jq0);
+    qq11             = _fjsp_mul_v2r8(iq1,jq1);
+    qq12             = _fjsp_mul_v2r8(iq1,jq2);
+    qq20             = _fjsp_mul_v2r8(iq2,jq0);
+    qq21             = _fjsp_mul_v2r8(iq2,jq1);
+    qq22             = _fjsp_mul_v2r8(iq2,jq2);
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+    rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+
+    rswitch_scalar   = fr->rvdw_switch;
+    rswitch          = gmx_fjsp_set1_v2r8(rswitch_scalar);
+    /* Setup switch parameters */
+    d_scalar         = rcutoff_scalar-rswitch_scalar;
+    d                = gmx_fjsp_set1_v2r8(d_scalar);
+    swV3             = gmx_fjsp_set1_v2r8(-10.0/(d_scalar*d_scalar*d_scalar));
+    swV4             = gmx_fjsp_set1_v2r8( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+    swV5             = gmx_fjsp_set1_v2r8( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+    swF2             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar));
+    swF3             = gmx_fjsp_set1_v2r8( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+    swF4             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx01             = _fjsp_sub_v2r8(ix0,jx1);
+            dy01             = _fjsp_sub_v2r8(iy0,jy1);
+            dz01             = _fjsp_sub_v2r8(iz0,jz1);
+            dx02             = _fjsp_sub_v2r8(ix0,jx2);
+            dy02             = _fjsp_sub_v2r8(iy0,jy2);
+            dz02             = _fjsp_sub_v2r8(iz0,jz2);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+            rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+            rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+            rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            d                = _fjsp_sub_v2r8(r00,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            fvdw             = _fjsp_msub_v2r8( fvdw,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(vvdw,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq01,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq01,_fjsp_msub_v2r8(rinv01,rinvsq01,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq01,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+            
+            fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq02,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq02,_fjsp_msub_v2r8(rinv02,rinvsq02,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq02,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+            
+            fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq11,_fjsp_msub_v2r8(rinv11,rinvsq11,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq12,_fjsp_msub_v2r8(rinv12,rinvsq12,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq21,_fjsp_msub_v2r8(rinv21,rinvsq21,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq22,_fjsp_msub_v2r8(rinv22,rinvsq22,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            }
+
+            gmx_fjsp_decrement_3rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 328 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx01             = _fjsp_sub_v2r8(ix0,jx1);
+            dy01             = _fjsp_sub_v2r8(iy0,jy1);
+            dz01             = _fjsp_sub_v2r8(iz0,jz1);
+            dx02             = _fjsp_sub_v2r8(ix0,jx2);
+            dy02             = _fjsp_sub_v2r8(iy0,jy2);
+            dz02             = _fjsp_sub_v2r8(iz0,jz2);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+            rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+            rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+            rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            d                = _fjsp_sub_v2r8(r00,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            fvdw             = _fjsp_msub_v2r8( fvdw,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(vvdw,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq01,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq01,_fjsp_msub_v2r8(rinv01,rinvsq01,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq01,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+            
+            fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq02,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq02,_fjsp_msub_v2r8(rinv02,rinvsq02,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq02,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+            
+            fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq11,_fjsp_msub_v2r8(rinv11,rinvsq11,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq12,_fjsp_msub_v2r8(rinv12,rinvsq12,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq21,_fjsp_msub_v2r8(rinv21,rinvsq21,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq22,_fjsp_msub_v2r8(rinv22,rinvsq22,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            }
+
+            gmx_fjsp_decrement_3rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 328 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 18 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*328);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_sparc64_hpc_ace_double.c b/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..7add775
--- /dev/null
@@ -0,0 +1,1213 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: ReactionField
+ * VdW interaction:            LennardJones
+ * Geometry:                   Water4-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
+    real             rswitch_scalar,d_scalar;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+    krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0);
+    crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf);
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+    rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+
+    rswitch_scalar   = fr->rvdw_switch;
+    rswitch          = gmx_fjsp_set1_v2r8(rswitch_scalar);
+    /* Setup switch parameters */
+    d_scalar         = rcutoff_scalar-rswitch_scalar;
+    d                = gmx_fjsp_set1_v2r8(d_scalar);
+    swV3             = gmx_fjsp_set1_v2r8(-10.0/(d_scalar*d_scalar*d_scalar));
+    swV4             = gmx_fjsp_set1_v2r8( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+    swV5             = gmx_fjsp_set1_v2r8( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+    swF2             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar));
+    swF3             = gmx_fjsp_set1_v2r8( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+    swF4             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_4rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+        fix3             = _fjsp_setzero_v2r8();
+        fiy3             = _fjsp_setzero_v2r8();
+        fiz3             = _fjsp_setzero_v2r8();
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+        vvdwsum          = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx30             = _fjsp_sub_v2r8(ix3,jx0);
+            dy30             = _fjsp_sub_v2r8(iy3,jy0);
+            dz30             = _fjsp_sub_v2r8(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            d                = _fjsp_sub_v2r8(r00,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            fvdw             = _fjsp_msub_v2r8( fvdw,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(vvdw,dsw)) );
+            vvdw             = _fjsp_mul_v2r8(vvdw,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdw             = _fjsp_and_v2r8(vvdw,cutoff_mask);
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq10,rinv10),crf));
+            felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq20,rinv20),crf));
+            felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq30,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _fjsp_mul_v2r8(iq3,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq30,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq30,rinv30),crf));
+            felec            = _fjsp_mul_v2r8(qq30,_fjsp_msub_v2r8(rinv30,rinvsq30,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq30,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+            
+            fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+
+            }
+
+            gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 182 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx30             = _fjsp_sub_v2r8(ix3,jx0);
+            dy30             = _fjsp_sub_v2r8(iy3,jy0);
+            dz30             = _fjsp_sub_v2r8(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            d                = _fjsp_sub_v2r8(r00,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            fvdw             = _fjsp_msub_v2r8( fvdw,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(vvdw,dsw)) );
+            vvdw             = _fjsp_mul_v2r8(vvdw,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdw             = _fjsp_and_v2r8(vvdw,cutoff_mask);
+            vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq10,rinv10),crf));
+            felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq20,rinv20),crf));
+            felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq30,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _fjsp_mul_v2r8(iq3,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq30,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq30,rinv30),crf));
+            felec            = _fjsp_mul_v2r8(qq30,_fjsp_msub_v2r8(rinv30,rinvsq30,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq30,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+            
+            fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+
+            }
+
+            gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 182 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_4atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 26 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_VF,outeriter*26 + inneriter*182);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: ReactionField
+ * VdW interaction:            LennardJones
+ * Geometry:                   Water4-Particle
+ * Calculate force/pot:        Force
+ *
+ * Force-only kernel. For every i entry of the neighbor list it loads four
+ * i sites (suffixes 0-3) and loops over single-particle j entries, two per
+ * iteration. Site 0 interacts through Lennard-Jones only, multiplied by a
+ * polynomial switch in (r - rvdw_switch); sites 1-3 interact through
+ * reaction-field electrostatics only. Every pair force is combined with
+ * cutoff_mask (rsq compared against rcutoff2) before it is accumulated.
+ * No potential energies are accumulated in this kernel.
+ *
+ * nlist       - supplies nri, iinr, jindex, jjnr and shift (gid is fetched but not used here)
+ * xx          - coordinates, read through xx[0]
+ * ff          - forces, updated through ff[0]
+ * fr          - supplies shift_vec, fshift, epsfac, ic->k_rf, ic->c_rf, ntype, nbfp,
+ *               rcoulomb and rvdw_switch
+ * mdatoms     - supplies chargeA and typeA
+ * kernel_data - not referenced in this kernel
+ * nrnb        - receives the flop count through inc_nrnb()
+ */
+void
+nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     * Several of the locals declared below are not referenced in this kernel.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
+    real             rswitch_scalar,d_scalar;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+    krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0);
+    crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf);
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    /* Setup water-specific parameters.
+     * The charges of sites 1-3 (pre-multiplied by facel) and the VdW parameter
+     * offset of site 0 are taken from the first i entry only and are reused,
+     * unchanged, for every i entry in the outer loop below.
+     * NOTE(review): iinr[0] is read here without checking nri first; confirm
+     * that callers never pass an empty list.
+     */
+    inr              = nlist->iinr[0];
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+    rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+
+    rswitch_scalar   = fr->rvdw_switch;
+    rswitch          = gmx_fjsp_set1_v2r8(rswitch_scalar);
+    /* Setup switch parameters.
+     * With d = r - rswitch (clamped at zero inside the loop) and taking
+     * _fjsp_madd_v2r8(a,b,c) as a*b+c, the loop evaluates
+     *   sw  = 1 + d^3*(swV3 + d*(swV4 + d*swV5))
+     *   dsw =     d^2*(swF2 + d*(swF3 + d*swF4))
+     * The vector d assigned here is overwritten inside the loop before it is read.
+     */
+    d_scalar         = rcutoff_scalar-rswitch_scalar;
+    d                = gmx_fjsp_set1_v2r8(d_scalar);
+    swV3             = gmx_fjsp_set1_v2r8(-10.0/(d_scalar*d_scalar*d_scalar));
+    swV4             = gmx_fjsp_set1_v2r8( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+    swV5             = gmx_fjsp_set1_v2r8( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+    swF2             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar));
+    swF3             = gmx_fjsp_set1_v2r8( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+    swF4             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_4rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+        fix3             = _fjsp_setzero_v2r8();
+        fiy3             = _fjsp_setzero_v2r8();
+        fiz3             = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop: two j entries (A and B) are handled per iteration;
+         * an unpaired last entry is handled by the if-block that follows this loop.
+         */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx30             = _fjsp_sub_v2r8(ix3,jx0);
+            dy30             = _fjsp_sub_v2r8(iy3,jy0);
+            dz30             = _fjsp_sub_v2r8(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             * i site 0 - j, LJ only  *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            d                = _fjsp_sub_v2r8(r00,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            fvdw             = _fjsp_msub_v2r8( fvdw,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(vvdw,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            fscal            = fvdw;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             * i site 1 - j, RF only  *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             * i site 2 - j, RF only  *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             * i site 3 - j, RF only  *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq30,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _fjsp_mul_v2r8(iq3,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq30,_fjsp_msub_v2r8(rinv30,rinvsq30,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq30,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+            
+            fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+
+            }
+
+            gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 161 flops */
+        }
+
+        if(jidx<j_index_end) /* one unpaired j entry is left over */
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates (single j entry; only jnrA is used in this block) */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx30             = _fjsp_sub_v2r8(ix3,jx0);
+            dy30             = _fjsp_sub_v2r8(iy3,jy0);
+            dz30             = _fjsp_sub_v2r8(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             * i site 0 - j, LJ only  *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            d                = _fjsp_sub_v2r8(r00,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            fvdw             = _fjsp_msub_v2r8( fvdw,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(vvdw,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            fscal            = fvdw;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8()); /* presumably zeroes the unused second element - TODO confirm */
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             * i site 1 - j, RF only  *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             * i site 2 - j, RF only  *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             * i site 3 - j, RF only  *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq30,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _fjsp_mul_v2r8(iq3,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq30,_fjsp_msub_v2r8(rinv30,rinvsq30,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq30,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+            
+            fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+
+            }
+
+            gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 161 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_4atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 24 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_F,outeriter*24 + inneriter*161);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRFCut_VdwLJSw_GeomW4W4_sparc64_hpc_ace_double.c b/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRFCut_VdwLJSw_GeomW4W4_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..0d66250
--- /dev/null
@@ -0,0 +1,2123 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwLJSw_GeomW4W4_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: ReactionField
+ * VdW interaction:            LennardJones
+ * Geometry:                   Water4-Water4
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecRFCut_VdwLJSw_GeomW4W4_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to the two-way j loop unrolling done with double precision SIMD, i.e. the two
+     * different jnr indices whose data are put in the two positions of the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B;
+    _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B;
+    _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    int              vdwjidx3A,vdwjidx3B;
+    _fjsp_v2r8       jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    _fjsp_v2r8       dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+    _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    _fjsp_v2r8       dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+    _fjsp_v2r8       dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+    _fjsp_v2r8       dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+    _fjsp_v2r8       dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
+    real             rswitch_scalar,d_scalar;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+    krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0);
+    crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf);
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+    jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+    jq3              = gmx_fjsp_set1_v2r8(charge[inr+3]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    c6_00            = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    qq11             = _fjsp_mul_v2r8(iq1,jq1);
+    qq12             = _fjsp_mul_v2r8(iq1,jq2);
+    qq13             = _fjsp_mul_v2r8(iq1,jq3);
+    qq21             = _fjsp_mul_v2r8(iq2,jq1);
+    qq22             = _fjsp_mul_v2r8(iq2,jq2);
+    qq23             = _fjsp_mul_v2r8(iq2,jq3);
+    qq31             = _fjsp_mul_v2r8(iq3,jq1);
+    qq32             = _fjsp_mul_v2r8(iq3,jq2);
+    qq33             = _fjsp_mul_v2r8(iq3,jq3);
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+    rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+
+    rswitch_scalar   = fr->rvdw_switch;
+    rswitch          = gmx_fjsp_set1_v2r8(rswitch_scalar);
+    /* Setup switch parameters */
+    d_scalar         = rcutoff_scalar-rswitch_scalar;
+    d                = gmx_fjsp_set1_v2r8(d_scalar);
+    swV3             = gmx_fjsp_set1_v2r8(-10.0/(d_scalar*d_scalar*d_scalar));
+    swV4             = gmx_fjsp_set1_v2r8( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+    swV5             = gmx_fjsp_set1_v2r8( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+    swF2             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar));
+    swF3             = gmx_fjsp_set1_v2r8( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+    swF4             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_4rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+        fix3             = _fjsp_setzero_v2r8();
+        fiy3             = _fjsp_setzero_v2r8();
+        fiz3             = _fjsp_setzero_v2r8();
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+        vvdwsum          = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_4rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx13             = _fjsp_sub_v2r8(ix1,jx3);
+            dy13             = _fjsp_sub_v2r8(iy1,jy3);
+            dz13             = _fjsp_sub_v2r8(iz1,jz3);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+            dx23             = _fjsp_sub_v2r8(ix2,jx3);
+            dy23             = _fjsp_sub_v2r8(iy2,jy3);
+            dz23             = _fjsp_sub_v2r8(iz2,jz3);
+            dx31             = _fjsp_sub_v2r8(ix3,jx1);
+            dy31             = _fjsp_sub_v2r8(iy3,jy1);
+            dz31             = _fjsp_sub_v2r8(iz3,jz1);
+            dx32             = _fjsp_sub_v2r8(ix3,jx2);
+            dy32             = _fjsp_sub_v2r8(iy3,jy2);
+            dz32             = _fjsp_sub_v2r8(iz3,jz2);
+            dx33             = _fjsp_sub_v2r8(ix3,jx3);
+            dy33             = _fjsp_sub_v2r8(iy3,jy3);
+            dz33             = _fjsp_sub_v2r8(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+            rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+            rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+            rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+            rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+            rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+            rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+            rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+            rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+            rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+            rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+            rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+            rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+            fjx3             = _fjsp_setzero_v2r8();
+            fjy3             = _fjsp_setzero_v2r8();
+            fjz3             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            d                = _fjsp_sub_v2r8(r00,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            fvdw             = _fjsp_msub_v2r8( fvdw,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(vvdw,dsw)) );
+            vvdw             = _fjsp_mul_v2r8(vvdw,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdw             = _fjsp_and_v2r8(vvdw,cutoff_mask);
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq11,rinv11),crf));
+            felec            = _fjsp_mul_v2r8(qq11,_fjsp_msub_v2r8(rinv11,rinvsq11,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq12,rinv12),crf));
+            felec            = _fjsp_mul_v2r8(qq12,_fjsp_msub_v2r8(rinv12,rinvsq12,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq13,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq13,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq13,rinv13),crf));
+            felec            = _fjsp_mul_v2r8(qq13,_fjsp_msub_v2r8(rinv13,rinvsq13,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq13,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+            
+            fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq21,rinv21),crf));
+            felec            = _fjsp_mul_v2r8(qq21,_fjsp_msub_v2r8(rinv21,rinvsq21,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq22,rinv22),crf));
+            felec            = _fjsp_mul_v2r8(qq22,_fjsp_msub_v2r8(rinv22,rinvsq22,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq23,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq23,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq23,rinv23),crf));
+            felec            = _fjsp_mul_v2r8(qq23,_fjsp_msub_v2r8(rinv23,rinvsq23,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq23,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+            
+            fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq31,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq31,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq31,rinv31),crf));
+            felec            = _fjsp_mul_v2r8(qq31,_fjsp_msub_v2r8(rinv31,rinvsq31,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq31,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+            
+            fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq32,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq32,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq32,rinv32),crf));
+            felec            = _fjsp_mul_v2r8(qq32,_fjsp_msub_v2r8(rinv32,rinvsq32,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq32,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+            
+            fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq33,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq33,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq33,rinv33),crf));
+            felec            = _fjsp_mul_v2r8(qq33,_fjsp_msub_v2r8(rinv33,rinvsq33,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq33,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+            
+            fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+
+            }
+
+            gmx_fjsp_decrement_4rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 416 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_4rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx13             = _fjsp_sub_v2r8(ix1,jx3);
+            dy13             = _fjsp_sub_v2r8(iy1,jy3);
+            dz13             = _fjsp_sub_v2r8(iz1,jz3);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+            dx23             = _fjsp_sub_v2r8(ix2,jx3);
+            dy23             = _fjsp_sub_v2r8(iy2,jy3);
+            dz23             = _fjsp_sub_v2r8(iz2,jz3);
+            dx31             = _fjsp_sub_v2r8(ix3,jx1);
+            dy31             = _fjsp_sub_v2r8(iy3,jy1);
+            dz31             = _fjsp_sub_v2r8(iz3,jz1);
+            dx32             = _fjsp_sub_v2r8(ix3,jx2);
+            dy32             = _fjsp_sub_v2r8(iy3,jy2);
+            dz32             = _fjsp_sub_v2r8(iz3,jz2);
+            dx33             = _fjsp_sub_v2r8(ix3,jx3);
+            dy33             = _fjsp_sub_v2r8(iy3,jy3);
+            dz33             = _fjsp_sub_v2r8(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+            rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+            rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+            rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+            rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+            rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+            rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+            rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+            rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+            rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+            rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+            rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+            rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+            fjx3             = _fjsp_setzero_v2r8();
+            fjy3             = _fjsp_setzero_v2r8();
+            fjz3             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            d                = _fjsp_sub_v2r8(r00,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            fvdw             = _fjsp_msub_v2r8( fvdw,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(vvdw,dsw)) );
+            vvdw             = _fjsp_mul_v2r8(vvdw,sw);
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdw             = _fjsp_and_v2r8(vvdw,cutoff_mask);
+            vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq11,rinv11),crf));
+            felec            = _fjsp_mul_v2r8(qq11,_fjsp_msub_v2r8(rinv11,rinvsq11,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq12,rinv12),crf));
+            felec            = _fjsp_mul_v2r8(qq12,_fjsp_msub_v2r8(rinv12,rinvsq12,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq13,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq13,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq13,rinv13),crf));
+            felec            = _fjsp_mul_v2r8(qq13,_fjsp_msub_v2r8(rinv13,rinvsq13,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq13,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+            
+            fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq21,rinv21),crf));
+            felec            = _fjsp_mul_v2r8(qq21,_fjsp_msub_v2r8(rinv21,rinvsq21,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq22,rinv22),crf));
+            felec            = _fjsp_mul_v2r8(qq22,_fjsp_msub_v2r8(rinv22,rinvsq22,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq23,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq23,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq23,rinv23),crf));
+            felec            = _fjsp_mul_v2r8(qq23,_fjsp_msub_v2r8(rinv23,rinvsq23,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq23,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+            
+            fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq31,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq31,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq31,rinv31),crf));
+            felec            = _fjsp_mul_v2r8(qq31,_fjsp_msub_v2r8(rinv31,rinvsq31,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq31,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+            
+            fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq32,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq32,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq32,rinv32),crf));
+            felec            = _fjsp_mul_v2r8(qq32,_fjsp_msub_v2r8(rinv32,rinvsq32,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq32,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+            
+            fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq33,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq33,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq33,rinv33),crf));
+            felec            = _fjsp_mul_v2r8(qq33,_fjsp_msub_v2r8(rinv33,rinvsq33,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq33,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+            
+            fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+
+            }
+
+            gmx_fjsp_decrement_4rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 416 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_4atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 26 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*416);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwLJSw_GeomW4W4_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: ReactionField
+ * VdW interaction:            LennardJones
+ * Geometry:                   Water4-Water4
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecRFCut_VdwLJSw_GeomW4W4_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B;
+    _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B;
+    _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    int              vdwjidx3A,vdwjidx3B;
+    _fjsp_v2r8       jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    _fjsp_v2r8       dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+    _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    _fjsp_v2r8       dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+    _fjsp_v2r8       dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+    _fjsp_v2r8       dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+    _fjsp_v2r8       dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
+    real             rswitch_scalar,d_scalar;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+    krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0);
+    crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf);
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+    jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+    jq3              = gmx_fjsp_set1_v2r8(charge[inr+3]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    c6_00            = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    qq11             = _fjsp_mul_v2r8(iq1,jq1);
+    qq12             = _fjsp_mul_v2r8(iq1,jq2);
+    qq13             = _fjsp_mul_v2r8(iq1,jq3);
+    qq21             = _fjsp_mul_v2r8(iq2,jq1);
+    qq22             = _fjsp_mul_v2r8(iq2,jq2);
+    qq23             = _fjsp_mul_v2r8(iq2,jq3);
+    qq31             = _fjsp_mul_v2r8(iq3,jq1);
+    qq32             = _fjsp_mul_v2r8(iq3,jq2);
+    qq33             = _fjsp_mul_v2r8(iq3,jq3);
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+    rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+
+    rswitch_scalar   = fr->rvdw_switch;
+    rswitch          = gmx_fjsp_set1_v2r8(rswitch_scalar);
+    /* Setup switch parameters */
+    d_scalar         = rcutoff_scalar-rswitch_scalar;
+    d                = gmx_fjsp_set1_v2r8(d_scalar);
+    swV3             = gmx_fjsp_set1_v2r8(-10.0/(d_scalar*d_scalar*d_scalar));
+    swV4             = gmx_fjsp_set1_v2r8( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+    swV5             = gmx_fjsp_set1_v2r8( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+    swF2             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar));
+    swF3             = gmx_fjsp_set1_v2r8( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+    swF4             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_4rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+        fix3             = _fjsp_setzero_v2r8();
+        fiy3             = _fjsp_setzero_v2r8();
+        fiz3             = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_4rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx13             = _fjsp_sub_v2r8(ix1,jx3);
+            dy13             = _fjsp_sub_v2r8(iy1,jy3);
+            dz13             = _fjsp_sub_v2r8(iz1,jz3);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+            dx23             = _fjsp_sub_v2r8(ix2,jx3);
+            dy23             = _fjsp_sub_v2r8(iy2,jy3);
+            dz23             = _fjsp_sub_v2r8(iz2,jz3);
+            dx31             = _fjsp_sub_v2r8(ix3,jx1);
+            dy31             = _fjsp_sub_v2r8(iy3,jy1);
+            dz31             = _fjsp_sub_v2r8(iz3,jz1);
+            dx32             = _fjsp_sub_v2r8(ix3,jx2);
+            dy32             = _fjsp_sub_v2r8(iy3,jy2);
+            dz32             = _fjsp_sub_v2r8(iz3,jz2);
+            dx33             = _fjsp_sub_v2r8(ix3,jx3);
+            dy33             = _fjsp_sub_v2r8(iy3,jy3);
+            dz33             = _fjsp_sub_v2r8(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+            rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+            rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+            rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+            rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+            rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+            rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+            rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+            rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+            rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+            rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+            rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+            rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+            fjx3             = _fjsp_setzero_v2r8();
+            fjy3             = _fjsp_setzero_v2r8();
+            fjz3             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            d                = _fjsp_sub_v2r8(r00,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            fvdw             = _fjsp_msub_v2r8( fvdw,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(vvdw,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            fscal            = fvdw;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq11,_fjsp_msub_v2r8(rinv11,rinvsq11,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq12,_fjsp_msub_v2r8(rinv12,rinvsq12,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq13,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq13,_fjsp_msub_v2r8(rinv13,rinvsq13,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq13,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+            
+            fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq21,_fjsp_msub_v2r8(rinv21,rinvsq21,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq22,_fjsp_msub_v2r8(rinv22,rinvsq22,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq23,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq23,_fjsp_msub_v2r8(rinv23,rinvsq23,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq23,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+            
+            fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq31,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq31,_fjsp_msub_v2r8(rinv31,rinvsq31,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq31,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+            
+            fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq32,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq32,_fjsp_msub_v2r8(rinv32,rinvsq32,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq32,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+            
+            fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq33,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq33,_fjsp_msub_v2r8(rinv33,rinvsq33,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq33,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+            
+            fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+
+            }
+
+            gmx_fjsp_decrement_4rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 359 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_4rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx13             = _fjsp_sub_v2r8(ix1,jx3);
+            dy13             = _fjsp_sub_v2r8(iy1,jy3);
+            dz13             = _fjsp_sub_v2r8(iz1,jz3);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+            dx23             = _fjsp_sub_v2r8(ix2,jx3);
+            dy23             = _fjsp_sub_v2r8(iy2,jy3);
+            dz23             = _fjsp_sub_v2r8(iz2,jz3);
+            dx31             = _fjsp_sub_v2r8(ix3,jx1);
+            dy31             = _fjsp_sub_v2r8(iy3,jy1);
+            dz31             = _fjsp_sub_v2r8(iz3,jz1);
+            dx32             = _fjsp_sub_v2r8(ix3,jx2);
+            dy32             = _fjsp_sub_v2r8(iy3,jy2);
+            dz32             = _fjsp_sub_v2r8(iz3,jz2);
+            dx33             = _fjsp_sub_v2r8(ix3,jx3);
+            dy33             = _fjsp_sub_v2r8(iy3,jy3);
+            dz33             = _fjsp_sub_v2r8(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+            rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+            rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+            rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+            rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+            rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+            rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+            rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+            rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+            rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+            rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+            rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+            rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+            fjx3             = _fjsp_setzero_v2r8();
+            fjy3             = _fjsp_setzero_v2r8();
+            fjz3             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            d                = _fjsp_sub_v2r8(r00,rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+
+            /* Evaluate switch function */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            fvdw             = _fjsp_msub_v2r8( fvdw,sw , _fjsp_mul_v2r8(rinv00,_fjsp_mul_v2r8(vvdw,dsw)) );
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            fscal            = fvdw;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq11,_fjsp_msub_v2r8(rinv11,rinvsq11,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq12,_fjsp_msub_v2r8(rinv12,rinvsq12,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq13,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq13,_fjsp_msub_v2r8(rinv13,rinvsq13,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq13,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+            
+            fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq21,_fjsp_msub_v2r8(rinv21,rinvsq21,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq22,_fjsp_msub_v2r8(rinv22,rinvsq22,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq23,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq23,_fjsp_msub_v2r8(rinv23,rinvsq23,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq23,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+            
+            fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq31,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq31,_fjsp_msub_v2r8(rinv31,rinvsq31,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq31,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+            
+            fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq32,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq32,_fjsp_msub_v2r8(rinv32,rinvsq32,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq32,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+            
+            fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq33,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq33,_fjsp_msub_v2r8(rinv33,rinvsq33,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq33,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+            
+            fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+
+            }
+
+            gmx_fjsp_decrement_4rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 359 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_4atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 24 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*359);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRFCut_VdwNone_GeomP1P1_sparc64_hpc_ace_double.c b/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRFCut_VdwNone_GeomP1P1_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..da6aa20
--- /dev/null
@@ -0,0 +1,534 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwNone_GeomP1P1_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: ReactionField (results are and-ed with a mask built from rsq < rcutoff2)
+ * VdW interaction:            None
+ * Geometry:                   Particle-Particle (one i atom per list entry, j atoms taken two at a time)
+ * Calculate force/pot:        PotentialAndForce (forces via ff and fr->fshift, energy via kernel_data->energygrp_elec)
+ */
+void
+nb_kernel_ElecRFCut_VdwNone_GeomP1P1_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,        /* neighbor list; nri, iinr, jindex, jjnr, shift and gid are read */
+                     rvec * gmx_restrict                    xx,           /* coordinates; accessed as a flat real array through xx[0] */
+                     rvec * gmx_restrict                    ff,           /* forces; accessed as a flat real array through ff[0] and handed to the force update helpers */
+                     t_forcerec * gmx_restrict              fr,           /* shift_vec, epsfac, rcoulomb, ic->k_rf and ic->c_rf are read; fshift is handed to the i-force update helper */
+                     t_mdatoms * gmx_restrict               mdatoms,      /* chargeA is read */
+                     nb_kernel_data_t * gmx_restrict        kernel_data,  /* energygrp_elec+ggid is handed to the potential update helper */
+                     t_nrnb * gmx_restrict                  nrnb)         /* flop accounting, updated once through inc_nrnb() at the end */
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters (this kernel only uses suffix 0).
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. the two different
+     * jnr indices (jnrA, jnrB) whose data is loaded together into one SIMD variable.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0; /* not referenced in this kernel */
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B; /* not referenced in this kernel */
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    _fjsp_v2r8       itab_tmp; /* not referenced in this kernel */
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0); /* not referenced in this kernel */
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0); /* not referenced in this kernel */
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv; /* not referenced in this kernel */
+
+    x                = xx[0]; /* flat view of the coordinates, indexed with DIM*atom offsets below */
+    f                = ff[0]; /* flat view of the forces, indexed with DIM*atom offsets below */
+
+    nri              = nlist->nri;      /* number of i entries, i.e. outer loop trip count */
+    iinr             = nlist->iinr;     /* i atom index for each i entry */
+    jindex           = nlist->jindex;   /* jindex[k]..jindex[k+1] delimits the jjnr range of i entry k */
+    jjnr             = nlist->jjnr;     /* j atom indices */
+    shiftidx         = nlist->shift;    /* shift vector index for each i entry */
+    gid              = nlist->gid;      /* index into energygrp_elec for each i entry */
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+    krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0); /* 2*k_rf, used in the force expression */
+    crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf);
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+    rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff); /* squared cutoff, compared against rsq00 */
+
+    /* Give these a defined value before the loops (avoids compiler warnings) */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists: one i atom per iteration */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Offset of the shift vector for this list (DIM reals per shift vector) */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_1rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+
+        /* Load parameters for i particles: charge of atom inr multiplied by facel */
+        iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_load1_v2r8(charge+inr+0));
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop: two j atoms (A,B) per pass; an unpaired last j atom is handled after the loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates of both j atoms */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector (i minus j) */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+
+            /* Load parameters for j particles: charges of jnrA and jnrB */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2)) /* presumably true when at least one lane has rsq < rcutoff2 - confirm in kernelutil header */
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS; assuming madd(a,b,c)=a*b+c and msub(a,b,c)=a*b-c (TODO confirm): velec=qq*(krf*rsq+rinv-crf), felec=qq*(rinv*rinvsq-krf2) */
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq00,rinv00),crf));
+            felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2); /* per-lane mask from rsq00 < rcutoff2, and-ed into velec and fscal below */
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force: accumulate d*fscal on the i atom (given madd as assumed above) */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fscal,dx00,dy00,dz00); /* j-side update; presumably subtracts d*fscal from f of jnrA/jnrB - confirm */
+
+            }
+
+            /* Inner loop uses 39 flops */
+        }
+
+        if(jidx<j_index_end) /* one unpaired j atom is left when the j count is odd */
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates of the single remaining j atom */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector (i minus j) */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+
+            /* Load parameters for j particles: a single charge via loadl on a zeroed register (presumably only the low lane is filled) */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS; same expressions as in the paired loop above */
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq00,rinv00),crf));
+            felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8()); /* presumably keeps the low lane and zeroes the lane that has no j atom - confirm */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8()); /* same lane clean-up as for velec above */
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fscal,dx00,dy00,dz00);
+
+            }
+
+            /* Inner loop uses 39 flops */
+        }
+
+        /* End of innermost loop: hand the accumulated i force to the helper together with the f and fshift locations of this i entry */
+
+        gmx_fjsp_update_iforce_1atom_swizzle_v2r8(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies: velecsum goes to the energygrp_elec entry selected by ggid */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+
+        /* Increment number of inner iterations (counted per j list entry, not per SIMD pass) */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 8 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops: 8 per i entry plus 39 per j entry, matching the flop comments above */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VF,outeriter*8 + inneriter*39);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwNone_GeomP1P1_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: ReactionField (force is and-ed with a mask built from rsq < rcutoff2)
+ * VdW interaction:            None
+ * Geometry:                   Particle-Particle (one i atom per list entry, j atoms taken two at a time)
+ * Calculate force/pot:        Force (no energy is accumulated; kernel_data is not referenced)
+ */
+void
+nb_kernel_ElecRFCut_VdwNone_GeomP1P1_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,        /* neighbor list; nri, iinr, jindex, jjnr, shift and gid are read */
+                     rvec * gmx_restrict                    xx,           /* coordinates; accessed as a flat real array through xx[0] */
+                     rvec * gmx_restrict                    ff,           /* forces; accessed as a flat real array through ff[0] and handed to the force update helpers */
+                     t_forcerec * gmx_restrict              fr,           /* shift_vec, epsfac, rcoulomb, ic->k_rf and ic->c_rf are read; fshift is handed to the i-force update helper */
+                     t_mdatoms * gmx_restrict               mdatoms,      /* chargeA is read */
+                     nb_kernel_data_t * gmx_restrict        kernel_data,  /* not referenced in this force-only kernel */
+                     t_nrnb * gmx_restrict                  nrnb)         /* flop accounting, updated once through inc_nrnb() at the end */
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters (this kernel only uses suffix 0).
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. the two different
+     * jnr indices (jnrA, jnrB) whose data is loaded together into one SIMD variable.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0; /* not referenced in this kernel */
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B; /* not referenced in this kernel */
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2; /* velec and velecsum are not referenced in this force-only kernel */
+    real             *charge;
+    _fjsp_v2r8       itab_tmp; /* not referenced in this kernel */
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0); /* not referenced in this kernel */
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0); /* not referenced in this kernel */
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv; /* not referenced in this kernel */
+
+    x                = xx[0]; /* flat view of the coordinates, indexed with DIM*atom offsets below */
+    f                = ff[0]; /* flat view of the forces, indexed with DIM*atom offsets below */
+
+    nri              = nlist->nri;      /* number of i entries, i.e. outer loop trip count */
+    iinr             = nlist->iinr;     /* i atom index for each i entry */
+    jindex           = nlist->jindex;   /* jindex[k]..jindex[k+1] delimits the jjnr range of i entry k */
+    jjnr             = nlist->jjnr;     /* j atom indices */
+    shiftidx         = nlist->shift;    /* shift vector index for each i entry */
+    gid              = nlist->gid;      /* assigned but not read in this force-only kernel */
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf); /* set but not read below; only krf2 enters the force */
+    krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0); /* 2*k_rf, used in the force expression */
+    crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf); /* set but not read below */
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+    rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff); /* squared cutoff, compared against rsq00 */
+
+    /* Give these a defined value before the loops (avoids compiler warnings) */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists: one i atom per iteration */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Offset of the shift vector for this list (DIM reals per shift vector) */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_1rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+
+        /* Load parameters for i particles: charge of atom inr multiplied by facel */
+        iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_load1_v2r8(charge+inr+0));
+
+        /* Start inner kernel loop: two j atoms (A,B) per pass; an unpaired last j atom is handled after the loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates of both j atoms */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector (i minus j) */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+
+            /* Load parameters for j particles: charges of jnrA and jnrB */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2)) /* presumably true when at least one lane has rsq < rcutoff2 - confirm in kernelutil header */
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS; assuming msub(a,b,c)=a*b-c (TODO confirm): felec=qq*(rinv*rinvsq-krf2) */
+            felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2); /* per-lane mask from rsq00 < rcutoff2, and-ed into fscal below */
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force: accumulate d*fscal on the i atom (assuming madd(a,b,c)=a*b+c - confirm) */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fscal,dx00,dy00,dz00); /* j-side update; presumably subtracts d*fscal from f of jnrA/jnrB - confirm */
+
+            }
+
+            /* Inner loop uses 33 flops */
+        }
+
+        if(jidx<j_index_end) /* one unpaired j atom is left when the j count is odd */
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates of the single remaining j atom */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector (i minus j) */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+
+            /* Load parameters for j particles: a single charge via loadl on a zeroed register (presumably only the low lane is filled) */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS; same force expression as in the paired loop above */
+            felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8()); /* presumably keeps the low lane and zeroes the lane that has no j atom - confirm */
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fscal,dx00,dy00,dz00);
+
+            }
+
+            /* Inner loop uses 33 flops */
+        }
+
+        /* End of innermost loop: hand the accumulated i force to the helper together with the f and fshift locations of this i entry */
+
+        gmx_fjsp_update_iforce_1atom_swizzle_v2r8(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations (counted per j list entry, not per SIMD pass) */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 7 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops: 7 per i entry plus 33 per j entry, matching the flop comments above */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_F,outeriter*7 + inneriter*33);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRFCut_VdwNone_GeomW3P1_sparc64_hpc_ace_double.c b/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRFCut_VdwNone_GeomW3P1_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..8556bfe
--- /dev/null
@@ -0,0 +1,916 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwNone_GeomW3P1_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: ReactionField
+ * VdW interaction:            None
+ * Geometry:                   Water3-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecRFCut_VdwNone_GeomW3P1_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+    krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0);
+    crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+    rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq00,rinv00),crf));
+            felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq10,rinv10),crf));
+            felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq20,rinv20),crf));
+            felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+
+            gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 120 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq00,rinv00),crf));
+            felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq10,rinv10),crf));
+            felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq20,rinv20),crf));
+            felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+
+            gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 120 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 19 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3_VF,outeriter*19 + inneriter*120);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwNone_GeomW3P1_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: ReactionField
+ * VdW interaction:            None
+ * Geometry:                   Water3-Particle
+ * Calculate force/pot:        Force
+ *
+ * Walks the nri entries of nlist. For each entry, three i atoms (suffixes
+ * 0,1,2) located at iinr[iidx] interact with every j atom listed in
+ * jjnr[jindex[iidx]] .. jjnr[jindex[iidx+1]-1]. Only the force scalar is
+ * evaluated; no energy is accumulated or written by this kernel.
+ *
+ * nlist        neighborlist: nri, iinr, jindex, jjnr, shift and gid are read
+ *              (gid is fetched into a local but not used afterwards).
+ * xx           coordinates; read through x = xx[0].
+ * ff           forces; the address f = ff[0] plus an atom offset is handed
+ *              to the gmx_fjsp_update_iforce_* / gmx_fjsp_decrement_* helpers.
+ * fr           supplies shift_vec, fshift, epsfac, rcoulomb, ic->k_rf and
+ *              ic->c_rf.
+ * mdatoms      supplies chargeA.
+ * kernel_data  not referenced in this force-only kernel.
+ * nrnb         receives the flop estimate under eNR_NBKERNEL_ELEC_W3_F.
+ */
+void
+nb_kernel_ElecRFCut_VdwNone_GeomW3P1_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     *
+     * NOTE(review): this file is produced by a kernel generator, and a number of the
+     * locals declared below are never referenced in this particular kernel
+     * (e.g. tx, ty, tz, jidxall, the vdw* offsets, isai*, isaj0, r*, c6_*, c12_*, velec, velecsum,
+     * itab_tmp, dummy_mask, one, two, vfconv, gbconv, ewconv). They are presumably kept so that
+     * all generated kernels share the same declaration block - confirm against the generator.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+    krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0);
+    crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf);
+
+    /* Setup water-specific parameters
+     * The three i-atom charges are read once, from the first i entry of the list, scaled by
+     * facel (fr->epsfac) and then reused unchanged for every iidx in the outer loop.
+     * NOTE(review): this assumes every i water in the list carries the same charges, and that
+     * iinr[0] is readable even when nri is 0 - confirm both against the caller.
+     */
+    inr              = nlist->iinr[0];
+    iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+    rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff); /* squared cutoff, compared against rsq below */
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        /* The i-atom force accumulators are reset for every i entry */
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop
+         * Two j atoms (A and B) are handled per iteration. The loop stops while at least two
+         * entries remain, so an odd trailing entry is left for the block after the loop.
+         */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+
+            /* The j force accumulator is shared by the three i-j interactions below */
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Whole interaction is skipped only when the helper reports no lane below the cutoff */
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS
+             * felec = qq*(rinv*rinvsq - krf2), assuming _fjsp_msub_v2r8(a,b,c) evaluates a*b-c
+             * (TODO confirm against the HPC-ACE intrinsic documentation).
+             */
+            felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            fscal            = felec;
+
+            /* Per-lane masking: the if above may be entered with only one lane inside the cutoff */
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force
+             * The same d*fscal contribution is accumulated for the i atom and for the j atom;
+             * the j sum is handed to the decrement helper at the end of the iteration.
+             */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+
+            gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 102 flops */
+        }
+
+        if(jidx<j_index_end) /* true only when one unpaired j entry is left after the 2-wide loop */
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+
+            /* Load parameters for j particles
+             * Only the charge of jnrA is read here; the first argument is a zero vector.
+             */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8()); /* presumably keeps the single valid lane and zeroes the unused one - verify intrinsic */
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+
+            gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 102 flops */
+        }
+
+        /* End of innermost loop */
+
+        /* Hand the accumulated forces of the three i atoms to the helper, together with the
+         * force and shift-force addresses for this i entry.
+         */
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start; /* counts j entries, not 2-wide loop trips */
+
+        /* Outer loop uses 18 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3_F,outeriter*18 + inneriter*102);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRFCut_VdwNone_GeomW3W3_sparc64_hpc_ace_double.c b/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRFCut_VdwNone_GeomW3W3_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..281bccc
--- /dev/null
@@ -0,0 +1,1820 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwNone_GeomW3W3_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: ReactionField
+ * VdW interaction:            None
+ * Geometry:                   Water3-Water3
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecRFCut_VdwNone_GeomW3W3_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B;
+    _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B;
+    _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+    _fjsp_v2r8       dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+    krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0);
+    crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+
+    jq0              = gmx_fjsp_set1_v2r8(charge[inr+0]);
+    jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+    jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+    qq00             = _fjsp_mul_v2r8(iq0,jq0);
+    qq01             = _fjsp_mul_v2r8(iq0,jq1);
+    qq02             = _fjsp_mul_v2r8(iq0,jq2);
+    qq10             = _fjsp_mul_v2r8(iq1,jq0);
+    qq11             = _fjsp_mul_v2r8(iq1,jq1);
+    qq12             = _fjsp_mul_v2r8(iq1,jq2);
+    qq20             = _fjsp_mul_v2r8(iq2,jq0);
+    qq21             = _fjsp_mul_v2r8(iq2,jq1);
+    qq22             = _fjsp_mul_v2r8(iq2,jq2);
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+    rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx01             = _fjsp_sub_v2r8(ix0,jx1);
+            dy01             = _fjsp_sub_v2r8(iy0,jy1);
+            dz01             = _fjsp_sub_v2r8(iz0,jz1);
+            dx02             = _fjsp_sub_v2r8(ix0,jx2);
+            dy02             = _fjsp_sub_v2r8(iy0,jy2);
+            dz02             = _fjsp_sub_v2r8(iz0,jz2);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+            rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+            rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+            rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq00,rinv00),crf));
+            felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq01,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq01,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq01,rinv01),crf));
+            felec            = _fjsp_mul_v2r8(qq01,_fjsp_msub_v2r8(rinv01,rinvsq01,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq01,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+            
+            fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq02,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq02,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq02,rinv02),crf));
+            felec            = _fjsp_mul_v2r8(qq02,_fjsp_msub_v2r8(rinv02,rinvsq02,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq02,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+            
+            fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq10,rinv10),crf));
+            felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq11,rinv11),crf));
+            felec            = _fjsp_mul_v2r8(qq11,_fjsp_msub_v2r8(rinv11,rinvsq11,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq12,rinv12),crf));
+            felec            = _fjsp_mul_v2r8(qq12,_fjsp_msub_v2r8(rinv12,rinvsq12,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq20,rinv20),crf));
+            felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq21,rinv21),crf));
+            felec            = _fjsp_mul_v2r8(qq21,_fjsp_msub_v2r8(rinv21,rinvsq21,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq22,rinv22),crf));
+            felec            = _fjsp_mul_v2r8(qq22,_fjsp_msub_v2r8(rinv22,rinvsq22,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            }
+
+            gmx_fjsp_decrement_3rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 351 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx01             = _fjsp_sub_v2r8(ix0,jx1);
+            dy01             = _fjsp_sub_v2r8(iy0,jy1);
+            dz01             = _fjsp_sub_v2r8(iz0,jz1);
+            dx02             = _fjsp_sub_v2r8(ix0,jx2);
+            dy02             = _fjsp_sub_v2r8(iy0,jy2);
+            dz02             = _fjsp_sub_v2r8(iz0,jz2);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+            rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+            rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+            rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq00,rinv00),crf));
+            felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq01,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq01,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq01,rinv01),crf));
+            felec            = _fjsp_mul_v2r8(qq01,_fjsp_msub_v2r8(rinv01,rinvsq01,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq01,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+            
+            fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq02,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq02,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq02,rinv02),crf));
+            felec            = _fjsp_mul_v2r8(qq02,_fjsp_msub_v2r8(rinv02,rinvsq02,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq02,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+            
+            fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq10,rinv10),crf));
+            felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq11,rinv11),crf));
+            felec            = _fjsp_mul_v2r8(qq11,_fjsp_msub_v2r8(rinv11,rinvsq11,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq12,rinv12),crf));
+            felec            = _fjsp_mul_v2r8(qq12,_fjsp_msub_v2r8(rinv12,rinvsq12,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq20,rinv20),crf));
+            felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq21,rinv21),crf));
+            felec            = _fjsp_mul_v2r8(qq21,_fjsp_msub_v2r8(rinv21,rinvsq21,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq22,rinv22),crf));
+            felec            = _fjsp_mul_v2r8(qq22,_fjsp_msub_v2r8(rinv22,rinvsq22,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            }
+
+            gmx_fjsp_decrement_3rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 351 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 19 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_VF,outeriter*19 + inneriter*351);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwNone_GeomW3W3_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: ReactionField
+ * VdW interaction:            None
+ * Geometry:                   Water3-Water3
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecRFCut_VdwNone_GeomW3W3_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B;
+    _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B;
+    _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+    _fjsp_v2r8       dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+    krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0);
+    crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+
+    jq0              = gmx_fjsp_set1_v2r8(charge[inr+0]);
+    jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+    jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+    qq00             = _fjsp_mul_v2r8(iq0,jq0);
+    qq01             = _fjsp_mul_v2r8(iq0,jq1);
+    qq02             = _fjsp_mul_v2r8(iq0,jq2);
+    qq10             = _fjsp_mul_v2r8(iq1,jq0);
+    qq11             = _fjsp_mul_v2r8(iq1,jq1);
+    qq12             = _fjsp_mul_v2r8(iq1,jq2);
+    qq20             = _fjsp_mul_v2r8(iq2,jq0);
+    qq21             = _fjsp_mul_v2r8(iq2,jq1);
+    qq22             = _fjsp_mul_v2r8(iq2,jq2);
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+    rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx01             = _fjsp_sub_v2r8(ix0,jx1);
+            dy01             = _fjsp_sub_v2r8(iy0,jy1);
+            dz01             = _fjsp_sub_v2r8(iz0,jz1);
+            dx02             = _fjsp_sub_v2r8(ix0,jx2);
+            dy02             = _fjsp_sub_v2r8(iy0,jy2);
+            dz02             = _fjsp_sub_v2r8(iz0,jz2);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+            rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+            rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+            rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq01,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq01,_fjsp_msub_v2r8(rinv01,rinvsq01,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq01,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+            
+            fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq02,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq02,_fjsp_msub_v2r8(rinv02,rinvsq02,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq02,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+            
+            fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq11,_fjsp_msub_v2r8(rinv11,rinvsq11,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq12,_fjsp_msub_v2r8(rinv12,rinvsq12,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq21,_fjsp_msub_v2r8(rinv21,rinvsq21,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq22,_fjsp_msub_v2r8(rinv22,rinvsq22,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            }
+
+            gmx_fjsp_decrement_3rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 297 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx01             = _fjsp_sub_v2r8(ix0,jx1);
+            dy01             = _fjsp_sub_v2r8(iy0,jy1);
+            dz01             = _fjsp_sub_v2r8(iz0,jz1);
+            dx02             = _fjsp_sub_v2r8(ix0,jx2);
+            dy02             = _fjsp_sub_v2r8(iy0,jy2);
+            dz02             = _fjsp_sub_v2r8(iz0,jz2);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+            rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+            rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+            rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq00,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq00,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq01,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq01,_fjsp_msub_v2r8(rinv01,rinvsq01,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq01,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+            
+            fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq02,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq02,_fjsp_msub_v2r8(rinv02,rinvsq02,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq02,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+            
+            fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq11,_fjsp_msub_v2r8(rinv11,rinvsq11,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq12,_fjsp_msub_v2r8(rinv12,rinvsq12,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq21,_fjsp_msub_v2r8(rinv21,rinvsq21,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq22,_fjsp_msub_v2r8(rinv22,rinvsq22,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            }
+
+            gmx_fjsp_decrement_3rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 297 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 18 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_F,outeriter*18 + inneriter*297);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRFCut_VdwNone_GeomW4P1_sparc64_hpc_ace_double.c b/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRFCut_VdwNone_GeomW4P1_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..4896f8f
--- /dev/null
@@ -0,0 +1,916 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwNone_GeomW4P1_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: ReactionField
+ * VdW interaction:            None
+ * Geometry:                   Water4-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecRFCut_VdwNone_GeomW4P1_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+    krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0);
+    crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+    rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
+                                                 &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+        fix3             = _fjsp_setzero_v2r8();
+        fiy3             = _fjsp_setzero_v2r8();
+        fiz3             = _fjsp_setzero_v2r8();
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx30             = _fjsp_sub_v2r8(ix3,jx0);
+            dy30             = _fjsp_sub_v2r8(iy3,jy0);
+            dz30             = _fjsp_sub_v2r8(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq10,rinv10),crf));
+            felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq20,rinv20),crf));
+            felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq30,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _fjsp_mul_v2r8(iq3,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq30,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq30,rinv30),crf));
+            felec            = _fjsp_mul_v2r8(qq30,_fjsp_msub_v2r8(rinv30,rinvsq30,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq30,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+            
+            fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+
+            }
+
+            gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 120 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx30             = _fjsp_sub_v2r8(ix3,jx0);
+            dy30             = _fjsp_sub_v2r8(iy3,jy0);
+            dz30             = _fjsp_sub_v2r8(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq10,rinv10),crf));
+            felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq20,rinv20),crf));
+            felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq30,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _fjsp_mul_v2r8(iq3,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq30,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq30,rinv30),crf));
+            felec            = _fjsp_mul_v2r8(qq30,_fjsp_msub_v2r8(rinv30,rinvsq30,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq30,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+            
+            fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+
+            }
+
+            gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 120 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset+DIM,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 19 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4_VF,outeriter*19 + inneriter*120);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwNone_GeomW4P1_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: ReactionField
+ * VdW interaction:            None
+ * Geometry:                   Water4-Particle
+ * Calculate force/pot:        Force
+ *
+ * For every i entry of the neighborlist, the three sites stored at offsets
+ * 1..3 from the i index are paired with each single-site j neighbor. Only
+ * forces are produced here; no potential energy is accumulated.
+ * The j loop processes two neighbors (A and B) per iteration, followed by an
+ * epilogue that handles one leftover neighbor when the j count is odd.
+ *
+ * Parameters, as used by this function:
+ *   nlist        neighborlist; nri, iinr, jindex, jjnr, shift and gid are read
+ *   xx           coordinates (read through x = xx[0])
+ *   ff           forces (updated through f = ff[0])
+ *   fr           shift_vec, fshift, epsfac, ic->k_rf, ic->c_rf, rcoulomb are read
+ *   mdatoms      chargeA is read
+ *   kernel_data  not referenced in this force-only flavor
+ *   nrnb         flop accounting, updated once at the end via inc_nrnb()
+ */
+void
+nb_kernel_ElecRFCut_VdwNone_GeomW4P1_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     *
+     * Several of the locals declared below (e.g. tx/ty/tz, jidxall, the vdw*
+     * offsets, isai and isaj values, velec/velecsum, itab_tmp, dummy_mask, one/two and the
+     * vfconv/gbconv/ewconv unions) are never referenced in this kernel.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid; /* assigned but not read in this force-only kernel */
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf); /* assigned but not read below */
+    krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0); /* 2*k_rf, used in the force expression */
+    crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf); /* assigned but not read below */
+
+    /* Setup water-specific parameters.
+     * The i-site charges are taken from the first list entry only and are
+     * pre-multiplied by facel; they are not reloaded inside the outer loop.
+     * NOTE(review): this presumably relies on all i waters in the list having
+     * identical charges, and on the list being non-empty (iinr[0] is read
+     * before nri is examined) - confirm against the caller.
+     */
+    inr              = nlist->iinr[0];
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+    rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff); /* squared cutoff, compared against rsq */
+
+    /* Avoid stupid compiler warnings: give the j indices and offsets a
+     * defined value before the loops assign them.
+     */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors: j entries for this i are
+         * jjnr[j_index_start] .. jjnr[j_index_end-1].
+         */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector.
+         * The +DIM skips the first site of the i molecule, so sites 1..3 are loaded.
+         */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
+                                                 &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+        fix3             = _fjsp_setzero_v2r8();
+        fiy3             = _fjsp_setzero_v2r8();
+        fiz3             = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop: two j neighbors (A,B) per iteration. The
+         * bound j_index_end-1 leaves a single trailing neighbor, if any, to
+         * the epilogue after this loop.
+         */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector (i site minus j atom) */
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx30             = _fjsp_sub_v2r8(ix3,jx0);
+            dy30             = _fjsp_sub_v2r8(iy3,jy0);
+            dz30             = _fjsp_sub_v2r8(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+
+            /* Load parameters for j particles (charges of neighbors A and B) */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             * (i site 1 with j atom) *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS
+             * Presumably felec = qq*(rinv*rinvsq - 2*k_rf), i.e. msub(a,b,c) = a*b-c;
+             * TODO confirm against the intrinsic definition.
+             */
+            felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force: the same dx*fscal term is accumulated for the i site and for the j atom */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             * (i site 2 with j atom) *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             * (i site 3 with j atom) *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq30,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _fjsp_mul_v2r8(iq3,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq30,_fjsp_msub_v2r8(rinv30,rinvsq30,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq30,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+            
+            fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+
+            }
+
+            gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 102 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx]; /* epilogue: the single leftover j neighbor */
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx30             = _fjsp_sub_v2r8(ix3,jx0);
+            dy30             = _fjsp_sub_v2r8(iy3,jy0);
+            dz30             = _fjsp_sub_v2r8(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+
+            /* Load parameters for j particles: only one charge is loaded,
+             * combined with a zero register for the other element.
+             */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             * (i site 1 with j atom) *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq10,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq10,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8()); /* presumably clears the element that holds no real neighbor - TODO confirm */
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             * (i site 2 with j atom) *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq20,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq20,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             * (i site 3 with j atom) *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq30,rcutoff2))
+            {
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _fjsp_mul_v2r8(iq3,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq30,_fjsp_msub_v2r8(rinv30,rinvsq30,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq30,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+            
+            fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+
+            }
+
+            gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 102 flops */
+        }
+
+        /* End of innermost loop.
+         * The accumulated i-site forces are handed to the update helper together
+         * with the force pointer for sites 1..3 and the shift-force pointer.
+         */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset+DIM,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations (counts every j entry, including an epilogue one) */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 18 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops: 18 per outer iteration plus 102 per j entry,
+     * booked under eNR_NBKERNEL_ELEC_W4_F.
+     */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4_F,outeriter*18 + inneriter*102);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRFCut_VdwNone_GeomW4W4_sparc64_hpc_ace_double.c b/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRFCut_VdwNone_GeomW4W4_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..7d92949
--- /dev/null
@@ -0,0 +1,1820 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwNone_GeomW4W4_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: ReactionField
+ * VdW interaction:            None
+ * Geometry:                   Water4-Water4
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecRFCut_VdwNone_GeomW4W4_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx1A,vdwjidx1B;
+    _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B;
+    _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    int              vdwjidx3A,vdwjidx3B;
+    _fjsp_v2r8       jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+    _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    _fjsp_v2r8       dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+    _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    _fjsp_v2r8       dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+    _fjsp_v2r8       dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+    _fjsp_v2r8       dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+    _fjsp_v2r8       dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+    krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0);
+    crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+
+    jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+    jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+    jq3              = gmx_fjsp_set1_v2r8(charge[inr+3]);
+    qq11             = _fjsp_mul_v2r8(iq1,jq1);
+    qq12             = _fjsp_mul_v2r8(iq1,jq2);
+    qq13             = _fjsp_mul_v2r8(iq1,jq3);
+    qq21             = _fjsp_mul_v2r8(iq2,jq1);
+    qq22             = _fjsp_mul_v2r8(iq2,jq2);
+    qq23             = _fjsp_mul_v2r8(iq2,jq3);
+    qq31             = _fjsp_mul_v2r8(iq3,jq1);
+    qq32             = _fjsp_mul_v2r8(iq3,jq2);
+    qq33             = _fjsp_mul_v2r8(iq3,jq3);
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+    rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
+                                                 &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+        fix3             = _fjsp_setzero_v2r8();
+        fiy3             = _fjsp_setzero_v2r8();
+        fiz3             = _fjsp_setzero_v2r8();
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
+                                              &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx13             = _fjsp_sub_v2r8(ix1,jx3);
+            dy13             = _fjsp_sub_v2r8(iy1,jy3);
+            dz13             = _fjsp_sub_v2r8(iz1,jz3);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+            dx23             = _fjsp_sub_v2r8(ix2,jx3);
+            dy23             = _fjsp_sub_v2r8(iy2,jy3);
+            dz23             = _fjsp_sub_v2r8(iz2,jz3);
+            dx31             = _fjsp_sub_v2r8(ix3,jx1);
+            dy31             = _fjsp_sub_v2r8(iy3,jy1);
+            dz31             = _fjsp_sub_v2r8(iz3,jz1);
+            dx32             = _fjsp_sub_v2r8(ix3,jx2);
+            dy32             = _fjsp_sub_v2r8(iy3,jy2);
+            dz32             = _fjsp_sub_v2r8(iz3,jz2);
+            dx33             = _fjsp_sub_v2r8(ix3,jx3);
+            dy33             = _fjsp_sub_v2r8(iy3,jy3);
+            dz33             = _fjsp_sub_v2r8(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+            rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+            rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+            rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+            rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+            rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+            rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+            rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+            rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+            rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+            rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+            rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+            rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+            fjx3             = _fjsp_setzero_v2r8();
+            fjy3             = _fjsp_setzero_v2r8();
+            fjz3             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq11,rinv11),crf));
+            felec            = _fjsp_mul_v2r8(qq11,_fjsp_msub_v2r8(rinv11,rinvsq11,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq12,rinv12),crf));
+            felec            = _fjsp_mul_v2r8(qq12,_fjsp_msub_v2r8(rinv12,rinvsq12,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq13,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq13,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq13,rinv13),crf));
+            felec            = _fjsp_mul_v2r8(qq13,_fjsp_msub_v2r8(rinv13,rinvsq13,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq13,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+            
+            fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq21,rinv21),crf));
+            felec            = _fjsp_mul_v2r8(qq21,_fjsp_msub_v2r8(rinv21,rinvsq21,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq22,rinv22),crf));
+            felec            = _fjsp_mul_v2r8(qq22,_fjsp_msub_v2r8(rinv22,rinvsq22,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq23,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq23,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq23,rinv23),crf));
+            felec            = _fjsp_mul_v2r8(qq23,_fjsp_msub_v2r8(rinv23,rinvsq23,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq23,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+            
+            fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq31,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq31,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq31,rinv31),crf));
+            felec            = _fjsp_mul_v2r8(qq31,_fjsp_msub_v2r8(rinv31,rinvsq31,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq31,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+            
+            fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq32,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq32,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq32,rinv32),crf));
+            felec            = _fjsp_mul_v2r8(qq32,_fjsp_msub_v2r8(rinv32,rinvsq32,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq32,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+            
+            fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq33,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq33,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq33,rinv33),crf));
+            felec            = _fjsp_mul_v2r8(qq33,_fjsp_msub_v2r8(rinv33,rinvsq33,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq33,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+            
+            fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+
+            }
+
+            gmx_fjsp_decrement_3rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA+DIM,f+j_coord_offsetB+DIM,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 351 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA+DIM,
+                                              &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx13             = _fjsp_sub_v2r8(ix1,jx3);
+            dy13             = _fjsp_sub_v2r8(iy1,jy3);
+            dz13             = _fjsp_sub_v2r8(iz1,jz3);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+            dx23             = _fjsp_sub_v2r8(ix2,jx3);
+            dy23             = _fjsp_sub_v2r8(iy2,jy3);
+            dz23             = _fjsp_sub_v2r8(iz2,jz3);
+            dx31             = _fjsp_sub_v2r8(ix3,jx1);
+            dy31             = _fjsp_sub_v2r8(iy3,jy1);
+            dz31             = _fjsp_sub_v2r8(iz3,jz1);
+            dx32             = _fjsp_sub_v2r8(ix3,jx2);
+            dy32             = _fjsp_sub_v2r8(iy3,jy2);
+            dz32             = _fjsp_sub_v2r8(iz3,jz2);
+            dx33             = _fjsp_sub_v2r8(ix3,jx3);
+            dy33             = _fjsp_sub_v2r8(iy3,jy3);
+            dz33             = _fjsp_sub_v2r8(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+            rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+            rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+            rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+            rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+            rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+            rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+            rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+            rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+            rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+            rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+            rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+            rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+            fjx3             = _fjsp_setzero_v2r8();
+            fjy3             = _fjsp_setzero_v2r8();
+            fjz3             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq11,rinv11),crf));
+            felec            = _fjsp_mul_v2r8(qq11,_fjsp_msub_v2r8(rinv11,rinvsq11,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq12,rinv12),crf));
+            felec            = _fjsp_mul_v2r8(qq12,_fjsp_msub_v2r8(rinv12,rinvsq12,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq13,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq13,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq13,rinv13),crf));
+            felec            = _fjsp_mul_v2r8(qq13,_fjsp_msub_v2r8(rinv13,rinvsq13,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq13,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+            
+            fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq21,rinv21),crf));
+            felec            = _fjsp_mul_v2r8(qq21,_fjsp_msub_v2r8(rinv21,rinvsq21,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq22,rinv22),crf));
+            felec            = _fjsp_mul_v2r8(qq22,_fjsp_msub_v2r8(rinv22,rinvsq22,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq23,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq23,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq23,rinv23),crf));
+            felec            = _fjsp_mul_v2r8(qq23,_fjsp_msub_v2r8(rinv23,rinvsq23,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq23,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+            
+            fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq31,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq31,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq31,rinv31),crf));
+            felec            = _fjsp_mul_v2r8(qq31,_fjsp_msub_v2r8(rinv31,rinvsq31,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq31,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+            
+            fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq32,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq32,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq32,rinv32),crf));
+            felec            = _fjsp_mul_v2r8(qq32,_fjsp_msub_v2r8(rinv32,rinvsq32,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq32,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+            
+            fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq33,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq33,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq33,rinv33),crf));
+            felec            = _fjsp_mul_v2r8(qq33,_fjsp_msub_v2r8(rinv33,rinvsq33,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq33,rcutoff2);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+            
+            fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+
+            }
+
+            gmx_fjsp_decrement_3rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA+DIM,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 351 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset+DIM,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 19 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_VF,outeriter*19 + inneriter*351);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwNone_GeomW4W4_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: ReactionField
+ * VdW interaction:            None
+ * Geometry:                   Water4-Water4
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecRFCut_VdwNone_GeomW4W4_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx1A,vdwjidx1B;
+    _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B;
+    _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    int              vdwjidx3A,vdwjidx3B;
+    _fjsp_v2r8       jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+    _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    _fjsp_v2r8       dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+    _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    _fjsp_v2r8       dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+    _fjsp_v2r8       dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+    _fjsp_v2r8       dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+    _fjsp_v2r8       dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+    krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0);
+    crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+
+    jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+    jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+    jq3              = gmx_fjsp_set1_v2r8(charge[inr+3]);
+    qq11             = _fjsp_mul_v2r8(iq1,jq1);
+    qq12             = _fjsp_mul_v2r8(iq1,jq2);
+    qq13             = _fjsp_mul_v2r8(iq1,jq3);
+    qq21             = _fjsp_mul_v2r8(iq2,jq1);
+    qq22             = _fjsp_mul_v2r8(iq2,jq2);
+    qq23             = _fjsp_mul_v2r8(iq2,jq3);
+    qq31             = _fjsp_mul_v2r8(iq3,jq1);
+    qq32             = _fjsp_mul_v2r8(iq3,jq2);
+    qq33             = _fjsp_mul_v2r8(iq3,jq3);
+
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+    rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
+                                                 &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+        fix3             = _fjsp_setzero_v2r8();
+        fiy3             = _fjsp_setzero_v2r8();
+        fiz3             = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
+                                              &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx13             = _fjsp_sub_v2r8(ix1,jx3);
+            dy13             = _fjsp_sub_v2r8(iy1,jy3);
+            dz13             = _fjsp_sub_v2r8(iz1,jz3);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+            dx23             = _fjsp_sub_v2r8(ix2,jx3);
+            dy23             = _fjsp_sub_v2r8(iy2,jy3);
+            dz23             = _fjsp_sub_v2r8(iz2,jz3);
+            dx31             = _fjsp_sub_v2r8(ix3,jx1);
+            dy31             = _fjsp_sub_v2r8(iy3,jy1);
+            dz31             = _fjsp_sub_v2r8(iz3,jz1);
+            dx32             = _fjsp_sub_v2r8(ix3,jx2);
+            dy32             = _fjsp_sub_v2r8(iy3,jy2);
+            dz32             = _fjsp_sub_v2r8(iz3,jz2);
+            dx33             = _fjsp_sub_v2r8(ix3,jx3);
+            dy33             = _fjsp_sub_v2r8(iy3,jy3);
+            dz33             = _fjsp_sub_v2r8(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+            rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+            rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+            rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+            rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+            rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+            rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+            rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+            rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+            rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+            rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+            rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+            rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+            fjx3             = _fjsp_setzero_v2r8();
+            fjy3             = _fjsp_setzero_v2r8();
+            fjz3             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq11,_fjsp_msub_v2r8(rinv11,rinvsq11,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq12,_fjsp_msub_v2r8(rinv12,rinvsq12,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq13,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq13,_fjsp_msub_v2r8(rinv13,rinvsq13,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq13,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+            
+            fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq21,_fjsp_msub_v2r8(rinv21,rinvsq21,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq22,_fjsp_msub_v2r8(rinv22,rinvsq22,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq23,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq23,_fjsp_msub_v2r8(rinv23,rinvsq23,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq23,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+            
+            fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq31,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq31,_fjsp_msub_v2r8(rinv31,rinvsq31,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq31,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+            
+            fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq32,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq32,_fjsp_msub_v2r8(rinv32,rinvsq32,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq32,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+            
+            fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq33,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq33,_fjsp_msub_v2r8(rinv33,rinvsq33,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq33,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+            
+            fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+
+            }
+
+            gmx_fjsp_decrement_3rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA+DIM,f+j_coord_offsetB+DIM,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 297 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA+DIM,
+                                              &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx13             = _fjsp_sub_v2r8(ix1,jx3);
+            dy13             = _fjsp_sub_v2r8(iy1,jy3);
+            dz13             = _fjsp_sub_v2r8(iz1,jz3);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+            dx23             = _fjsp_sub_v2r8(ix2,jx3);
+            dy23             = _fjsp_sub_v2r8(iy2,jy3);
+            dz23             = _fjsp_sub_v2r8(iz2,jz3);
+            dx31             = _fjsp_sub_v2r8(ix3,jx1);
+            dy31             = _fjsp_sub_v2r8(iy3,jy1);
+            dz31             = _fjsp_sub_v2r8(iz3,jz1);
+            dx32             = _fjsp_sub_v2r8(ix3,jx2);
+            dy32             = _fjsp_sub_v2r8(iy3,jy2);
+            dz32             = _fjsp_sub_v2r8(iz3,jz2);
+            dx33             = _fjsp_sub_v2r8(ix3,jx3);
+            dy33             = _fjsp_sub_v2r8(iy3,jy3);
+            dz33             = _fjsp_sub_v2r8(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+            rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+            rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+            rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+            rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+            rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+            rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+            rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+            rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+            rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+            rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+            rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+            rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+            fjx3             = _fjsp_setzero_v2r8();
+            fjy3             = _fjsp_setzero_v2r8();
+            fjz3             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq11,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq11,_fjsp_msub_v2r8(rinv11,rinvsq11,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq11,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq12,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq12,_fjsp_msub_v2r8(rinv12,rinvsq12,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq12,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq13,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq13,_fjsp_msub_v2r8(rinv13,rinvsq13,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq13,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+            
+            fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq21,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq21,_fjsp_msub_v2r8(rinv21,rinvsq21,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq21,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq22,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq22,_fjsp_msub_v2r8(rinv22,rinvsq22,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq22,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq23,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq23,_fjsp_msub_v2r8(rinv23,rinvsq23,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq23,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+            
+            fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq31,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq31,_fjsp_msub_v2r8(rinv31,rinvsq31,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq31,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+            
+            fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq32,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq32,_fjsp_msub_v2r8(rinv32,rinvsq32,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq32,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+            
+            fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+
+            }
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            if (gmx_fjsp_any_lt_v2r8(rsq33,rcutoff2))
+            {
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq33,_fjsp_msub_v2r8(rinv33,rinvsq33,krf2));
+
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq33,rcutoff2);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+            
+            fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+
+            }
+
+            gmx_fjsp_decrement_3rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA+DIM,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 297 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset+DIM,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 18 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_F,outeriter*18 + inneriter*297);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRF_VdwCSTab_GeomP1P1_sparc64_hpc_ace_double.c b/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRF_VdwCSTab_GeomP1P1_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..32d447f
--- /dev/null
@@ -0,0 +1,683 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRF_VdwCSTab_GeomP1P1_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: ReactionField (uses fr->ic->k_rf and fr->ic->c_rf)
+ * VdW interaction:            CubicSplineTable (uses kernel_data->table_vdw, scaled by c6/c12 read from fr->nbfp)
+ * Geometry:                   Particle-Particle (one i atom per list entry against its list of single j atoms)
+ * Calculate force/pot:        PotentialAndForce (force terms are handed to the update/decrement helpers, energy sums to gmx_fjsp_update_1pot_v2r8)
+ */
+void
+nb_kernel_ElecRF_VdwCSTab_GeomP1P1_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,        /* neighbor list; nri, iinr, jindex, jjnr, shift and gid are read */
+                     rvec * gmx_restrict                    xx,           /* coordinates; xx[0] is used as a flat real pointer */
+                     rvec * gmx_restrict                    ff,           /* forces; ff[0] is used as a flat real pointer */
+                     t_forcerec * gmx_restrict              fr,           /* supplies shift_vec, fshift, epsfac, ic->k_rf, ic->c_rf, ntype, nbfp */
+                     t_mdatoms * gmx_restrict               mdatoms,      /* supplies per-atom chargeA and typeA */
+                     nb_kernel_data_t * gmx_restrict        kernel_data,  /* supplies table_vdw (data, scale) and energygrp_elec/energygrp_vdw */
+                     t_nrnb * gmx_restrict                  nrnb)         /* counter handed to inc_nrnb() at the end */
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar; /* not referenced in this kernel */
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; /* only fscal is referenced in this kernel */
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; /* isai0 is not referenced in this kernel */
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; /* fjx0,fjy0,fjz0,isaj0 are not referenced in this kernel */
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6; /* rinvsix,rvdw,sh_vdw_invrcut6 are not referenced */
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0); /* initialized but not referenced in this kernel */
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0); /* initialized but not referenced in this kernel */
+    _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps; /* Heps is not referenced in this kernel */
+    real             *vftab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask; /* not referenced in this kernel */
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0); /* initialized but not referenced in this kernel */
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0); /* initialized but not referenced in this kernel */
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv; /* vfconv reinterprets a SIMD value as two integer table indices; gbconv,ewconv are not referenced */
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac); /* multiplied into the i-atom charge in the outer loop */
+    charge           = mdatoms->chargeA;
+    krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+    krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0); /* 2*k_rf, used in the force expression */
+    crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf);
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    vftab            = kernel_data->table_vdw->data;
+    vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_vdw->scale);
+
+    /* Pre-initialize the j indices and offsets so compilers do not warn about possibly uninitialized use */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists: one i atom (iinr[iidx]) per iteration */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors: j entries are jjnr[j_index_start .. j_index_end-1] */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_1rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+
+        /* Load parameters for i particles: charge pre-scaled by facel, and the i-type offset into vdwparam */
+        iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_load1_v2r8(charge+inr+0));
+        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0]; /* the j type adds 2*type below; presumably (c6,c12) pairs per type pair - TODO confirm layout of fr->nbfp */
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+        vvdwsum          = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop: two j neighbors (A,B) per iteration; an odd leftover is handled after the loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector (i minus j) */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00); /* by its name presumably 1/sqrt(rsq00), i.e. 1/r - TODO confirm in kernelutil header */
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00); /* rsq * rinv; equals r if rinv00 is 1/r */
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp)); /* rt minus its integer-converted value: position inside the table interval */
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp); /* store through the union so the indices can be read as vfconv.i[0], vfconv.i[1] */
+
+            vfconv.i[0]     *= 8; /* 8 table entries per point: the loads below use offsets +0,+2 (dispersion) and +4,+6 (repulsion) */
+            vfconv.i[1]     *= 8;
+
+            /* REACTION-FIELD ELECTROSTATICS: assuming madd(a,b,c)=a*b+c and msub(a,b,c)=a*b-c (TODO confirm), velec=qq*(rinv+krf*rsq-crf), felec=qq*(rinv*rinvsq-krf2) */
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq00,rinv00),crf));
+            felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+
+            /* CUBIC SPLINE TABLE DISPERSION: under the same madd assumption, Fp=F+eps*(G+eps*H), VV=Y+eps*Fp, FF=Fp+eps*(G+2*eps*H) */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] ); /* table entry for j atom A */
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] ); /* table entry for j atom B */
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F); /* presumably regroups the A/B loads so Y and F each hold one coefficient for both atoms - TODO confirm */
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 2 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION: same evaluation as dispersion, using table offsets +4 and +6 and scaling by c12 */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 4 );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 6 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            vvdw             = _fjsp_add_v2r8(vvdw12,vvdw6);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00))); /* negated (fvdw6+fvdw12)*vftabscale*rinv00 */
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            /* Update vectorial force: fscal times the displacement is accumulated on the i atom; the helper below handles the j atoms (a decrement, judging by its name) */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fscal,dx00,dy00,dz00);
+
+            /* Inner loop uses 70 flops */
+        }
+
+        if(jidx<j_index_end) /* true when the j count is odd: one neighbor (A only) is left over */
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates (single pointer: only atom A exists here) */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector (i minus j) */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0); /* one charge is loaded into a zero vector; presumably the other position stays 0 - TODO confirm */
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 8;
+            vfconv.i[1]     *= 8; /* vfconv.i[1] is not used for any table load in this single-j branch */
+
+            /* REACTION-FIELD ELECTROSTATICS (same expressions as in the unrolled loop) */
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq00,rinv00),crf));
+            felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+
+            /* CUBIC SPLINE TABLE DISPERSION: only the atom-A table entry is loaded; zeros stand in for the missing atom B */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION: same as dispersion, using table offsets +4 and +6 and scaling by c12 */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            vvdw             = _fjsp_add_v2r8(vvdw12,vvdw6);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8()); /* combined with a zero vector, presumably so the position without a j atom adds nothing - TODO confirm unpacklo semantics */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+            vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8()); /* same zero-vector combination applied to the force scalar */
+
+            /* Update vectorial force: i-atom accumulation here, j atom A through the single-pointer helper below */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fscal,dx00,dy00,dz00);
+
+            /* Inner loop uses 70 flops */
+        }
+
+        /* End of innermost loop: hand the accumulated i-atom force to the helper together with the i force and shift-force locations */
+
+        gmx_fjsp_update_iforce_1atom_swizzle_v2r8(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies for the energy group index of this list entry */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations (counts j entries, not unrolled loop trips) */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 9 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops: 9 per outer iteration plus 70 per j interaction */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_VF,outeriter*9 + inneriter*70);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRF_VdwCSTab_GeomP1P1_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: ReactionField
+ * VdW interaction:            CubicSplineTable
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecRF_VdwCSTab_GeomP1P1_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+    real             *vftab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+    krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0);
+    crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf);
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    vftab            = kernel_data->table_vdw->data;
+    vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_vdw->scale);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_1rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+
+        /* Load parameters for i particles */
+        iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_load1_v2r8(charge+inr+0));
+        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 8;
+            vfconv.i[1]     *= 8;
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 2 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 4 );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 6 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fscal,dx00,dy00,dz00);
+
+            /* Inner loop uses 57 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 8;
+            vfconv.i[1]     *= 8;
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fscal,dx00,dy00,dz00);
+
+            /* Inner loop uses 57 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_1atom_swizzle_v2r8(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 7 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_F,outeriter*7 + inneriter*57);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRF_VdwCSTab_GeomW3P1_sparc64_hpc_ace_double.c b/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRF_VdwCSTab_GeomW3P1_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..681fc26
--- /dev/null
@@ -0,0 +1,989 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRF_VdwCSTab_GeomW3P1_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: ReactionField (evaluated for all three i atoms against each j atom)
+ * VdW interaction:            CubicSplineTable (evaluated for i atom 0 only, table taken from kernel_data->table_vdw)
+ * Geometry:                   Water3-Particle (three i atoms per list entry, one atom per j entry)
+ * Calculate force/pot:        PotentialAndForce (ff, fr->fshift and kernel_data->energygrp_elec/vdw are updated through the gmx_fjsp_* helpers)
+ */
+void
+nb_kernel_ElecRF_VdwCSTab_GeomW3P1_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters (this kernel only uses i atoms 0,1,2 and j atom 0).
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+    real             *vftab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv; /* only vfconv is used below: itab_tmp is stored to .simd and the two table indices are read back from .i[] */
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+    krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0);
+    crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf);
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    vftab            = kernel_data->table_vdw->data;
+    vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_vdw->scale);
+
+    /* Setup water-specific parameters once, from the first i entry (iinr[0]); NOTE(review): assumes nri>0 and that all i waters share charges and VdW type - confirm */
+    inr              = nlist->iinr[0];
+    iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* Give the j indices/offsets defined values up front; presumably this silences maybe-uninitialized compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Offset of the shift vector for this list (DIM reals per shift index) */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+        vvdwsum          = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop: two j entries (A and B) per iteration; an odd leftover entry is handled after the loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer; vfeps is the remaining fraction */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 8; /* stride of 8 reals per table point: offsets 0..3 are read for dispersion and 4..7 for repulsion below */
+            vfconv.i[1]     *= 8;
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq00,rinv00),crf));
+            felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 2 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 4 );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 6 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            vvdw             = _fjsp_add_v2r8(vvdw12,vvdw6);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq10,rinv10),crf));
+            felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq20,rinv20),crf));
+            felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 143 flops */
+        }
+
+        if(jidx<j_index_end) /* odd number of j entries: process the last one alone, using only jnrA */
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer; vfeps is the remaining fraction */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 8;
+            vfconv.i[1]     *= 8; /* the second index is not used for any table load in this single-j path */
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq00,rinv00),crf));
+            felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+
+            /* CUBIC SPLINE TABLE DISPERSION (zeros stand in for the second j entry's table data) */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION (zeros stand in for the second j entry's table data) */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            vvdw             = _fjsp_add_v2r8(vvdw12,vvdw6);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. NOTE(review): the unpacklo with zero presumably clears the unused second position first - confirm */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+            vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq10,rinv10),crf));
+            felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq20,rinv20),crf));
+            felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 143 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies for the energy group (gid) of this list entry */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations (counted per j entry, not per unrolled pair) */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 20 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops: 20 per i entry plus 143 per j entry, matching the counts noted above */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_VF,outeriter*20 + inneriter*143);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRF_VdwCSTab_GeomW3P1_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: ReactionField
+ * VdW interaction:            CubicSplineTable
+ * Geometry:                   Water3-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecRF_VdwCSTab_GeomW3P1_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+    real             *vftab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+    krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0);
+    crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf);
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    vftab            = kernel_data->table_vdw->data;
+    vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_vdw->scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 8;
+            vfconv.i[1]     *= 8;
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 2 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 4 );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 6 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 120 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 8;
+            vfconv.i[1]     *= 8;
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 120 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 18 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_F,outeriter*18 + inneriter*120);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRF_VdwCSTab_GeomW3W3_sparc64_hpc_ace_double.c b/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRF_VdwCSTab_GeomW3W3_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..b314416
--- /dev/null
@@ -0,0 +1,1659 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRF_VdwCSTab_GeomW3W3_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: ReactionField
+ * VdW interaction:            CubicSplineTable
+ * Geometry:                   Water3-Water3
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecRF_VdwCSTab_GeomW3W3_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B;
+    _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B;
+    _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+    _fjsp_v2r8       dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+    real             *vftab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+    krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0);
+    crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf);
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    vftab            = kernel_data->table_vdw->data;
+    vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_vdw->scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq0              = gmx_fjsp_set1_v2r8(charge[inr+0]);
+    jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+    jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    qq00             = _fjsp_mul_v2r8(iq0,jq0);
+    c6_00            = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    qq01             = _fjsp_mul_v2r8(iq0,jq1);
+    qq02             = _fjsp_mul_v2r8(iq0,jq2);
+    qq10             = _fjsp_mul_v2r8(iq1,jq0);
+    qq11             = _fjsp_mul_v2r8(iq1,jq1);
+    qq12             = _fjsp_mul_v2r8(iq1,jq2);
+    qq20             = _fjsp_mul_v2r8(iq2,jq0);
+    qq21             = _fjsp_mul_v2r8(iq2,jq1);
+    qq22             = _fjsp_mul_v2r8(iq2,jq2);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+        vvdwsum          = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx01             = _fjsp_sub_v2r8(ix0,jx1);
+            dy01             = _fjsp_sub_v2r8(iy0,jy1);
+            dz01             = _fjsp_sub_v2r8(iz0,jz1);
+            dx02             = _fjsp_sub_v2r8(ix0,jx2);
+            dy02             = _fjsp_sub_v2r8(iy0,jy2);
+            dz02             = _fjsp_sub_v2r8(iz0,jz2);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+            rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+            rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+            rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 8;
+            vfconv.i[1]     *= 8;
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq00,rinv00),crf));
+            felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 2 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 4 );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 6 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            vvdw             = _fjsp_add_v2r8(vvdw12,vvdw6);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq01,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq01,rinv01),crf));
+            felec            = _fjsp_mul_v2r8(qq01,_fjsp_msub_v2r8(rinv01,rinvsq01,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+            
+            fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq02,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq02,rinv02),crf));
+            felec            = _fjsp_mul_v2r8(qq02,_fjsp_msub_v2r8(rinv02,rinvsq02,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+            
+            fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq10,rinv10),crf));
+            felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq11,rinv11),crf));
+            felec            = _fjsp_mul_v2r8(qq11,_fjsp_msub_v2r8(rinv11,rinvsq11,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq12,rinv12),crf));
+            felec            = _fjsp_mul_v2r8(qq12,_fjsp_msub_v2r8(rinv12,rinvsq12,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq20,rinv20),crf));
+            felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq21,rinv21),crf));
+            felec            = _fjsp_mul_v2r8(qq21,_fjsp_msub_v2r8(rinv21,rinvsq21,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq22,rinv22),crf));
+            felec            = _fjsp_mul_v2r8(qq22,_fjsp_msub_v2r8(rinv22,rinvsq22,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            gmx_fjsp_decrement_3rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 350 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx01             = _fjsp_sub_v2r8(ix0,jx1);
+            dy01             = _fjsp_sub_v2r8(iy0,jy1);
+            dz01             = _fjsp_sub_v2r8(iz0,jz1);
+            dx02             = _fjsp_sub_v2r8(ix0,jx2);
+            dy02             = _fjsp_sub_v2r8(iy0,jy2);
+            dz02             = _fjsp_sub_v2r8(iz0,jz2);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+            rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+            rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+            rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 8;
+            vfconv.i[1]     *= 8;
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq00,rinv00),crf));
+            felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            vvdw             = _fjsp_add_v2r8(vvdw12,vvdw6);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+            vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq01,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq01,rinv01),crf));
+            felec            = _fjsp_mul_v2r8(qq01,_fjsp_msub_v2r8(rinv01,rinvsq01,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+            
+            fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq02,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq02,rinv02),crf));
+            felec            = _fjsp_mul_v2r8(qq02,_fjsp_msub_v2r8(rinv02,rinvsq02,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+            
+            fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq10,rinv10),crf));
+            felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq11,rinv11),crf));
+            felec            = _fjsp_mul_v2r8(qq11,_fjsp_msub_v2r8(rinv11,rinvsq11,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq12,rinv12),crf));
+            felec            = _fjsp_mul_v2r8(qq12,_fjsp_msub_v2r8(rinv12,rinvsq12,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq20,rinv20),crf));
+            felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq21,rinv21),crf));
+            felec            = _fjsp_mul_v2r8(qq21,_fjsp_msub_v2r8(rinv21,rinvsq21,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq22,rinv22),crf));
+            felec            = _fjsp_mul_v2r8(qq22,_fjsp_msub_v2r8(rinv22,rinvsq22,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            gmx_fjsp_decrement_3rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 350 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 20 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*350);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRF_VdwCSTab_GeomW3W3_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: ReactionField
+ * VdW interaction:            CubicSplineTable
+ * Geometry:                   Water3-Water3
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecRF_VdwCSTab_GeomW3W3_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B;
+    _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B;
+    _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+    _fjsp_v2r8       dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+    real             *vftab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+    krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0);
+    crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf);
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    vftab            = kernel_data->table_vdw->data;
+    vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_vdw->scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq0              = gmx_fjsp_set1_v2r8(charge[inr+0]);
+    jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+    jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    qq00             = _fjsp_mul_v2r8(iq0,jq0);
+    c6_00            = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    qq01             = _fjsp_mul_v2r8(iq0,jq1);
+    qq02             = _fjsp_mul_v2r8(iq0,jq2);
+    qq10             = _fjsp_mul_v2r8(iq1,jq0);
+    qq11             = _fjsp_mul_v2r8(iq1,jq1);
+    qq12             = _fjsp_mul_v2r8(iq1,jq2);
+    qq20             = _fjsp_mul_v2r8(iq2,jq0);
+    qq21             = _fjsp_mul_v2r8(iq2,jq1);
+    qq22             = _fjsp_mul_v2r8(iq2,jq2);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx01             = _fjsp_sub_v2r8(ix0,jx1);
+            dy01             = _fjsp_sub_v2r8(iy0,jy1);
+            dz01             = _fjsp_sub_v2r8(iz0,jz1);
+            dx02             = _fjsp_sub_v2r8(ix0,jx2);
+            dy02             = _fjsp_sub_v2r8(iy0,jy2);
+            dz02             = _fjsp_sub_v2r8(iz0,jz2);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+            rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+            rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+            rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 8;
+            vfconv.i[1]     *= 8;
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 2 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 4 );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 6 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq01,_fjsp_msub_v2r8(rinv01,rinvsq01,krf2));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+            
+            fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq02,_fjsp_msub_v2r8(rinv02,rinvsq02,krf2));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+            
+            fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq11,_fjsp_msub_v2r8(rinv11,rinvsq11,krf2));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq12,_fjsp_msub_v2r8(rinv12,rinvsq12,krf2));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq21,_fjsp_msub_v2r8(rinv21,rinvsq21,krf2));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq22,_fjsp_msub_v2r8(rinv22,rinvsq22,krf2));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            gmx_fjsp_decrement_3rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 297 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx01             = _fjsp_sub_v2r8(ix0,jx1);
+            dy01             = _fjsp_sub_v2r8(iy0,jy1);
+            dz01             = _fjsp_sub_v2r8(iz0,jz1);
+            dx02             = _fjsp_sub_v2r8(ix0,jx2);
+            dy02             = _fjsp_sub_v2r8(iy0,jy2);
+            dz02             = _fjsp_sub_v2r8(iz0,jz2);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+            rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+            rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+            rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 8;
+            vfconv.i[1]     *= 8;
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq01,_fjsp_msub_v2r8(rinv01,rinvsq01,krf2));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+            
+            fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq02,_fjsp_msub_v2r8(rinv02,rinvsq02,krf2));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+            
+            fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq11,_fjsp_msub_v2r8(rinv11,rinvsq11,krf2));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq12,_fjsp_msub_v2r8(rinv12,rinvsq12,krf2));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq21,_fjsp_msub_v2r8(rinv21,rinvsq21,krf2));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq22,_fjsp_msub_v2r8(rinv22,rinvsq22,krf2));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            gmx_fjsp_decrement_3rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 297 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 18 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*297);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRF_VdwCSTab_GeomW4P1_sparc64_hpc_ace_double.c b/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRF_VdwCSTab_GeomW4P1_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..22d3111
--- /dev/null
@@ -0,0 +1,1097 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRF_VdwCSTab_GeomW4P1_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: ReactionField
+ * VdW interaction:            CubicSplineTable
+ * Geometry:                   Water4-Particle
+ * Calculate force/pot:        PotentialAndForce
+ *
+ * For every entry of the neighbor list, the four atoms of the i entry interact
+ * with each single j particle: i atom 0 gets only the tabulated Van der Waals
+ * interaction, i atoms 1-3 get only reaction-field electrostatics.
+ * Forces and shift forces are handed to helper routines together with pointers
+ * into ff and fr->fshift; potentials go to the per-group energy arrays of
+ * kernel_data; the flop count is added to nrnb.
+ *
+ * nlist       - neighbor list; nri, iinr, jindex, jjnr, shift and gid are read.
+ * xx          - coordinates, accessed as a flat array of reals (DIM per atom).
+ * ff          - forces, accessed as a flat array of reals (DIM per atom).
+ * fr          - supplies shift_vec, fshift, epsfac, ic->k_rf, ic->c_rf, ntype, nbfp.
+ * mdatoms     - supplies chargeA and typeA.
+ * kernel_data - supplies table_vdw (data, scale) and the energygrp_elec and
+ *               energygrp_vdw accumulators.
+ * nrnb        - flop accounting, updated through inc_nrnb().
+ */
+void
+nb_kernel_ElecRF_VdwCSTab_GeomW4P1_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     *
+     * NOTE(review): several locals declared below (for example rcutoff, isai0, one_sixth,
+     * dummy_mask, gbconv, ewconv) are never referenced in this kernel; presumably the
+     * generator emits one common declaration set for all kernel flavours.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+    real             *vftab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv; /* vfconv gives an integer view of the stored table indices */
+
+    x                = xx[0]; /* pointer to the first real of the coordinate array */
+    f                = ff[0]; /* pointer to the first real of the force array */
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+    krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0); /* 2*k_rf, used in the force expression */
+    crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf);
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    vftab            = kernel_data->table_vdw->data;
+    vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_vdw->scale);
+
+    /* Setup water-specific parameters.
+     * The charges of i atoms 1-3 (pre-multiplied by facel) and the VdW parameter offset of
+     * i atom 0 are taken once from the first i entry and reused for every i entry below,
+     * i.e. all i entries are treated as having identical parameters. No charge is set up
+     * for i atom 0 because it takes part only in the VdW interaction.
+     * NOTE(review): iinr[0] is read even when nri is 0 - confirm callers never pass an empty list.
+     */
+    inr              = nlist->iinr[0];
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* Pre-initialize the j indices and offsets to silence compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_4rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+        fix3             = _fjsp_setzero_v2r8();
+        fiy3             = _fjsp_setzero_v2r8();
+        fiz3             = _fjsp_setzero_v2r8();
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+        vvdwsum          = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop.
+         * Two j particles (A and B) are handled per iteration; an odd leftover
+         * entry is processed by the single-j block that follows this loop.
+         */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx30             = _fjsp_sub_v2r8(ix3,jx0);
+            dy30             = _fjsp_sub_v2r8(iy3,jy0);
+            dz30             = _fjsp_sub_v2r8(iz3,jz0);
+
+            /* Calculate squared distance and things based on it.
+             * rinvsq is formed only for i atoms 1-3, the ones with electrostatics.
+             */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 8; /* table stride: offsets 0-3 are read for dispersion, 4-7 for repulsion */
+            vfconv.i[1]     *= 8;
+
+            /* CUBIC SPLINE TABLE DISPERSION
+             * Each load fetches a (Y,F) or (G,H) pair for one j particle; the transpose
+             * presumably regroups them so that Y,F,G,H each hold the value for both j particles.
+             * Assuming madd(a,b,c) = a*b+c:
+             *   Fp = F + eps*(G + eps*H),  VV = Y + eps*Fp,  FF = Fp + eps*(G + 2*eps*H)
+             */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 2 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION
+             * Same evaluation using table offsets 4-7 and c12; the scalar force is then
+             * formed as -(fvdw6+fvdw12)*vftabscale*rinv00.
+             */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 4 );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 6 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            vvdw             = _fjsp_add_v2r8(vvdw12,vvdw6);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            /* Update vectorial force.
+             * The same d*fscal product is accumulated for the i atom and for the j particle;
+             * the j sum is handed to the decrement helper at the end of the iteration.
+             */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS
+             * Assuming madd(a,b,c) = a*b+c and msub(a,b,c) = a*b-c:
+             *   velec = qq*(rinv + krf*rsq - crf),  felec = qq*(rinv*rinvsq - 2*krf)
+             */
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq10,rinv10),crf));
+            felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq20,rinv20),crf));
+            felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _fjsp_mul_v2r8(iq3,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq30,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq30,rinv30),crf));
+            felec            = _fjsp_mul_v2r8(qq30,_fjsp_msub_v2r8(rinv30,rinvsq30,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+            
+            fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+
+            gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 167 flops */
+        }
+
+        if(jidx<j_index_end) /* true only when one j entry is left over after the pairwise loop */
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx30             = _fjsp_sub_v2r8(ix3,jx0);
+            dy30             = _fjsp_sub_v2r8(iy3,jy0);
+            dz30             = _fjsp_sub_v2r8(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+
+            /* Load parameters for j particles (only particle A exists in this block) */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 8;
+            vfconv.i[1]     *= 8; /* scaled for symmetry with the pairwise loop; i[1] is not used for any load below */
+
+            /* CUBIC SPLINE TABLE DISPERSION
+             * There is no B particle, so the second operand of each transpose is zero
+             * instead of a table load; only vfconv.i[0] is used to address the table.
+             */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            vvdw             = _fjsp_add_v2r8(vvdw12,vvdw6);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom.
+             * The unpacklo with zero presumably clears the SIMD slot that has no j particle,
+             * so it adds nothing to the sum; the same pattern is applied to fscal below.
+             */
+            vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq10,rinv10),crf));
+            felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq20,rinv20),crf));
+            felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _fjsp_mul_v2r8(iq3,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq30,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq30,rinv30),crf));
+            felec            = _fjsp_mul_v2r8(qq30,_fjsp_msub_v2r8(rinv30,rinvsq30,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+            
+            fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+
+            gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 167 flops */
+        }
+
+        /* End of innermost loop.
+         * The accumulated i forces are passed on together with the force and shift-force
+         * locations of this i entry.
+         */
+
+        gmx_fjsp_update_iforce_4atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies (slot ggid of the per-group energy arrays) */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations (counted per j entry, not per unrolled pass) */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 26 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops: 26 per i entry plus 167 per j entry, as noted in the loops above */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_VF,outeriter*26 + inneriter*167);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRF_VdwCSTab_GeomW4P1_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: ReactionField
+ * VdW interaction:            CubicSplineTable
+ * Geometry:                   Water4-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecRF_VdwCSTab_GeomW4P1_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+    real             *vftab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+    krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0);
+    crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf);
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    vftab            = kernel_data->table_vdw->data;
+    vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_vdw->scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_4rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+        fix3             = _fjsp_setzero_v2r8();
+        fiy3             = _fjsp_setzero_v2r8();
+        fiz3             = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx30             = _fjsp_sub_v2r8(ix3,jx0);
+            dy30             = _fjsp_sub_v2r8(iy3,jy0);
+            dz30             = _fjsp_sub_v2r8(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 8;
+            vfconv.i[1]     *= 8;
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 2 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 4 );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 6 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            fscal            = fvdw;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _fjsp_mul_v2r8(iq3,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq30,_fjsp_msub_v2r8(rinv30,rinvsq30,krf2));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+            
+            fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+
+            gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 144 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx30             = _fjsp_sub_v2r8(ix3,jx0);
+            dy30             = _fjsp_sub_v2r8(iy3,jy0);
+            dz30             = _fjsp_sub_v2r8(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 8;
+            vfconv.i[1]     *= 8;
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            fscal            = fvdw;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _fjsp_mul_v2r8(iq3,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq30,_fjsp_msub_v2r8(rinv30,rinvsq30,krf2));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+            
+            fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+
+            gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 144 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_4atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 24 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_F,outeriter*24 + inneriter*144);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRF_VdwCSTab_GeomW4W4_sparc64_hpc_ace_double.c b/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRF_VdwCSTab_GeomW4W4_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..1688019
--- /dev/null
@@ -0,0 +1,1779 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRF_VdwCSTab_GeomW4W4_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: ReactionField
+ * VdW interaction:            CubicSplineTable
+ * Geometry:                   Water4-Water4
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecRF_VdwCSTab_GeomW4W4_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B;
+    _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B;
+    _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    int              vdwjidx3A,vdwjidx3B;
+    _fjsp_v2r8       jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    _fjsp_v2r8       dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+    _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    _fjsp_v2r8       dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+    _fjsp_v2r8       dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+    _fjsp_v2r8       dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+    _fjsp_v2r8       dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+    real             *vftab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+    krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0);
+    crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf);
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    vftab            = kernel_data->table_vdw->data;
+    vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_vdw->scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+    jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+    jq3              = gmx_fjsp_set1_v2r8(charge[inr+3]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    c6_00            = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    qq11             = _fjsp_mul_v2r8(iq1,jq1);
+    qq12             = _fjsp_mul_v2r8(iq1,jq2);
+    qq13             = _fjsp_mul_v2r8(iq1,jq3);
+    qq21             = _fjsp_mul_v2r8(iq2,jq1);
+    qq22             = _fjsp_mul_v2r8(iq2,jq2);
+    qq23             = _fjsp_mul_v2r8(iq2,jq3);
+    qq31             = _fjsp_mul_v2r8(iq3,jq1);
+    qq32             = _fjsp_mul_v2r8(iq3,jq2);
+    qq33             = _fjsp_mul_v2r8(iq3,jq3);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_4rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+        fix3             = _fjsp_setzero_v2r8();
+        fiy3             = _fjsp_setzero_v2r8();
+        fiz3             = _fjsp_setzero_v2r8();
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+        vvdwsum          = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_4rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx13             = _fjsp_sub_v2r8(ix1,jx3);
+            dy13             = _fjsp_sub_v2r8(iy1,jy3);
+            dz13             = _fjsp_sub_v2r8(iz1,jz3);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+            dx23             = _fjsp_sub_v2r8(ix2,jx3);
+            dy23             = _fjsp_sub_v2r8(iy2,jy3);
+            dz23             = _fjsp_sub_v2r8(iz2,jz3);
+            dx31             = _fjsp_sub_v2r8(ix3,jx1);
+            dy31             = _fjsp_sub_v2r8(iy3,jy1);
+            dz31             = _fjsp_sub_v2r8(iz3,jz1);
+            dx32             = _fjsp_sub_v2r8(ix3,jx2);
+            dy32             = _fjsp_sub_v2r8(iy3,jy2);
+            dz32             = _fjsp_sub_v2r8(iz3,jz2);
+            dx33             = _fjsp_sub_v2r8(ix3,jx3);
+            dy33             = _fjsp_sub_v2r8(iy3,jy3);
+            dz33             = _fjsp_sub_v2r8(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+            rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+            rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+            rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+            rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+            rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+            rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+            rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+            rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+            rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+            rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+            rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+            rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+            fjx3             = _fjsp_setzero_v2r8();
+            fjy3             = _fjsp_setzero_v2r8();
+            fjz3             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 8;
+            vfconv.i[1]     *= 8;
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 2 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 4 );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 6 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            vvdw             = _fjsp_add_v2r8(vvdw12,vvdw6);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq11,rinv11),crf));
+            felec            = _fjsp_mul_v2r8(qq11,_fjsp_msub_v2r8(rinv11,rinvsq11,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq12,rinv12),crf));
+            felec            = _fjsp_mul_v2r8(qq12,_fjsp_msub_v2r8(rinv12,rinvsq12,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq13,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq13,rinv13),crf));
+            felec            = _fjsp_mul_v2r8(qq13,_fjsp_msub_v2r8(rinv13,rinvsq13,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+            
+            fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq21,rinv21),crf));
+            felec            = _fjsp_mul_v2r8(qq21,_fjsp_msub_v2r8(rinv21,rinvsq21,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq22,rinv22),crf));
+            felec            = _fjsp_mul_v2r8(qq22,_fjsp_msub_v2r8(rinv22,rinvsq22,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq23,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq23,rinv23),crf));
+            felec            = _fjsp_mul_v2r8(qq23,_fjsp_msub_v2r8(rinv23,rinvsq23,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+            
+            fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq31,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq31,rinv31),crf));
+            felec            = _fjsp_mul_v2r8(qq31,_fjsp_msub_v2r8(rinv31,rinvsq31,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+            
+            fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq32,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq32,rinv32),crf));
+            felec            = _fjsp_mul_v2r8(qq32,_fjsp_msub_v2r8(rinv32,rinvsq32,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+            
+            fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq33,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq33,rinv33),crf));
+            felec            = _fjsp_mul_v2r8(qq33,_fjsp_msub_v2r8(rinv33,rinvsq33,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+            
+            fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+
+            gmx_fjsp_decrement_4rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 377 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_4rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx13             = _fjsp_sub_v2r8(ix1,jx3);
+            dy13             = _fjsp_sub_v2r8(iy1,jy3);
+            dz13             = _fjsp_sub_v2r8(iz1,jz3);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+            dx23             = _fjsp_sub_v2r8(ix2,jx3);
+            dy23             = _fjsp_sub_v2r8(iy2,jy3);
+            dz23             = _fjsp_sub_v2r8(iz2,jz3);
+            dx31             = _fjsp_sub_v2r8(ix3,jx1);
+            dy31             = _fjsp_sub_v2r8(iy3,jy1);
+            dz31             = _fjsp_sub_v2r8(iz3,jz1);
+            dx32             = _fjsp_sub_v2r8(ix3,jx2);
+            dy32             = _fjsp_sub_v2r8(iy3,jy2);
+            dz32             = _fjsp_sub_v2r8(iz3,jz2);
+            dx33             = _fjsp_sub_v2r8(ix3,jx3);
+            dy33             = _fjsp_sub_v2r8(iy3,jy3);
+            dz33             = _fjsp_sub_v2r8(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+            rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+            rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+            rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+            rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+            rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+            rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+            rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+            rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+            rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+            rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+            rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+            rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+            fjx3             = _fjsp_setzero_v2r8();
+            fjy3             = _fjsp_setzero_v2r8();
+            fjz3             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 8;
+            vfconv.i[1]     *= 8;
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,VV);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            vvdw             = _fjsp_add_v2r8(vvdw12,vvdw6);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq11,rinv11),crf));
+            felec            = _fjsp_mul_v2r8(qq11,_fjsp_msub_v2r8(rinv11,rinvsq11,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq12,rinv12),crf));
+            felec            = _fjsp_mul_v2r8(qq12,_fjsp_msub_v2r8(rinv12,rinvsq12,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq13,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq13,rinv13),crf));
+            felec            = _fjsp_mul_v2r8(qq13,_fjsp_msub_v2r8(rinv13,rinvsq13,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+            
+            fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq21,rinv21),crf));
+            felec            = _fjsp_mul_v2r8(qq21,_fjsp_msub_v2r8(rinv21,rinvsq21,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq22,rinv22),crf));
+            felec            = _fjsp_mul_v2r8(qq22,_fjsp_msub_v2r8(rinv22,rinvsq22,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq23,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq23,rinv23),crf));
+            felec            = _fjsp_mul_v2r8(qq23,_fjsp_msub_v2r8(rinv23,rinvsq23,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+            
+            fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq31,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq31,rinv31),crf));
+            felec            = _fjsp_mul_v2r8(qq31,_fjsp_msub_v2r8(rinv31,rinvsq31,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+            
+            fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq32,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq32,rinv32),crf));
+            felec            = _fjsp_mul_v2r8(qq32,_fjsp_msub_v2r8(rinv32,rinvsq32,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+            
+            fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq33,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq33,rinv33),crf));
+            felec            = _fjsp_mul_v2r8(qq33,_fjsp_msub_v2r8(rinv33,rinvsq33,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+            
+            fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+
+            gmx_fjsp_decrement_4rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 377 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_4atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 26 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*377);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRF_VdwCSTab_GeomW4W4_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: ReactionField
+ * VdW interaction:            CubicSplineTable
+ * Geometry:                   Water4-Water4
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecRF_VdwCSTab_GeomW4W4_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B;
+    _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B;
+    _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    int              vdwjidx3A,vdwjidx3B;
+    _fjsp_v2r8       jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    _fjsp_v2r8       dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+    _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    _fjsp_v2r8       dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+    _fjsp_v2r8       dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+    _fjsp_v2r8       dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+    _fjsp_v2r8       dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+    real             *vftab;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+    krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0);
+    crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf);
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    vftab            = kernel_data->table_vdw->data;
+    vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_vdw->scale);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+    jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+    jq3              = gmx_fjsp_set1_v2r8(charge[inr+3]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    c6_00            = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    qq11             = _fjsp_mul_v2r8(iq1,jq1);
+    qq12             = _fjsp_mul_v2r8(iq1,jq2);
+    qq13             = _fjsp_mul_v2r8(iq1,jq3);
+    qq21             = _fjsp_mul_v2r8(iq2,jq1);
+    qq22             = _fjsp_mul_v2r8(iq2,jq2);
+    qq23             = _fjsp_mul_v2r8(iq2,jq3);
+    qq31             = _fjsp_mul_v2r8(iq3,jq1);
+    qq32             = _fjsp_mul_v2r8(iq3,jq2);
+    qq33             = _fjsp_mul_v2r8(iq3,jq3);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_4rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+        fix3             = _fjsp_setzero_v2r8();
+        fiy3             = _fjsp_setzero_v2r8();
+        fiz3             = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_4rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx13             = _fjsp_sub_v2r8(ix1,jx3);
+            dy13             = _fjsp_sub_v2r8(iy1,jy3);
+            dz13             = _fjsp_sub_v2r8(iz1,jz3);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+            dx23             = _fjsp_sub_v2r8(ix2,jx3);
+            dy23             = _fjsp_sub_v2r8(iy2,jy3);
+            dz23             = _fjsp_sub_v2r8(iz2,jz3);
+            dx31             = _fjsp_sub_v2r8(ix3,jx1);
+            dy31             = _fjsp_sub_v2r8(iy3,jy1);
+            dz31             = _fjsp_sub_v2r8(iz3,jz1);
+            dx32             = _fjsp_sub_v2r8(ix3,jx2);
+            dy32             = _fjsp_sub_v2r8(iy3,jy2);
+            dz32             = _fjsp_sub_v2r8(iz3,jz2);
+            dx33             = _fjsp_sub_v2r8(ix3,jx3);
+            dy33             = _fjsp_sub_v2r8(iy3,jy3);
+            dz33             = _fjsp_sub_v2r8(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+            rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+            rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+            rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+            rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+            rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+            rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+            rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+            rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+            rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+            rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+            rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+            rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+            fjx3             = _fjsp_setzero_v2r8();
+            fjy3             = _fjsp_setzero_v2r8();
+            fjz3             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 8;
+            vfconv.i[1]     *= 8;
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 2 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 4 );
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 6 );
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            fscal            = fvdw;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq11,_fjsp_msub_v2r8(rinv11,rinvsq11,krf2));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq12,_fjsp_msub_v2r8(rinv12,rinvsq12,krf2));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq13,_fjsp_msub_v2r8(rinv13,rinvsq13,krf2));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+            
+            fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq21,_fjsp_msub_v2r8(rinv21,rinvsq21,krf2));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq22,_fjsp_msub_v2r8(rinv22,rinvsq22,krf2));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq23,_fjsp_msub_v2r8(rinv23,rinvsq23,krf2));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+            
+            fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq31,_fjsp_msub_v2r8(rinv31,rinvsq31,krf2));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+            
+            fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq32,_fjsp_msub_v2r8(rinv32,rinvsq32,krf2));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+            
+            fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq33,_fjsp_msub_v2r8(rinv33,rinvsq33,krf2));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+            
+            fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+
+            gmx_fjsp_decrement_4rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 324 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_4rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx13             = _fjsp_sub_v2r8(ix1,jx3);
+            dy13             = _fjsp_sub_v2r8(iy1,jy3);
+            dz13             = _fjsp_sub_v2r8(iz1,jz3);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+            dx23             = _fjsp_sub_v2r8(ix2,jx3);
+            dy23             = _fjsp_sub_v2r8(iy2,jy3);
+            dz23             = _fjsp_sub_v2r8(iz2,jz3);
+            dx31             = _fjsp_sub_v2r8(ix3,jx1);
+            dy31             = _fjsp_sub_v2r8(iy3,jy1);
+            dz31             = _fjsp_sub_v2r8(iz3,jz1);
+            dx32             = _fjsp_sub_v2r8(ix3,jx2);
+            dy32             = _fjsp_sub_v2r8(iy3,jy2);
+            dz32             = _fjsp_sub_v2r8(iz3,jz2);
+            dx33             = _fjsp_sub_v2r8(ix3,jx3);
+            dy33             = _fjsp_sub_v2r8(iy3,jy3);
+            dz33             = _fjsp_sub_v2r8(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+            rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+            rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+            rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+            rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+            rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+            rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+            rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+            rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+            rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+            rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+            rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+            rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+            fjx3             = _fjsp_setzero_v2r8();
+            fjy3             = _fjsp_setzero_v2r8();
+            fjz3             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            r00              = _fjsp_mul_v2r8(rsq00,rinv00);
+
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r00,vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            vfconv.i[0]     *= 8;
+            vfconv.i[1]     *= 8;
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_00,FF);
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            F                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            H                = _fjsp_setzero_v2r8();
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_00,FF);
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv00)));
+
+            fscal            = fvdw;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq11,_fjsp_msub_v2r8(rinv11,rinvsq11,krf2));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq12,_fjsp_msub_v2r8(rinv12,rinvsq12,krf2));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq13,_fjsp_msub_v2r8(rinv13,rinvsq13,krf2));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+            
+            fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq21,_fjsp_msub_v2r8(rinv21,rinvsq21,krf2));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq22,_fjsp_msub_v2r8(rinv22,rinvsq22,krf2));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq23,_fjsp_msub_v2r8(rinv23,rinvsq23,krf2));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+            
+            fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq31,_fjsp_msub_v2r8(rinv31,rinvsq31,krf2));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+            
+            fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq32,_fjsp_msub_v2r8(rinv32,rinvsq32,krf2));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+            
+            fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq33,_fjsp_msub_v2r8(rinv33,rinvsq33,krf2));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+            
+            fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+
+            gmx_fjsp_decrement_4rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 324 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_4atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 24 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*324);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRF_VdwLJ_GeomP1P1_sparc64_hpc_ace_double.c b/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRF_VdwLJ_GeomP1P1_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..dc84753
--- /dev/null
@@ -0,0 +1,549 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRF_VdwLJ_GeomP1P1_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: ReactionField
+ * VdW interaction:            LennardJones
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecRF_VdwLJ_GeomP1P1_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Reaction-field plus Lennard-Jones kernel for single particles: forces and both potential sums.
+     * Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop; this
+     * non-water kernel only uses 0. Suffixes A,B refer to the two j neighbors (jnrA,jnrB) of one
+     * unrolled inner-loop step, whose data share the two positions of the double precision SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+    krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0);
+    crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf);
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    /* Give the j indices and offsets defined values up front to avoid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Offset of this list's shift vector (DIM reals per vector), used for shiftvec and fshift */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* The j neighbors of this list are jjnr[j_index_start] up to jjnr[j_index_end-1] */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_1rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+
+        /* i particle charge pre-multiplied by epsfac, and the i-type offset into the (c6,c12) pair table */
+        iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_load1_v2r8(charge+inr+0));
+        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+        vvdwsum          = _fjsp_setzero_v2r8();
+
+        /* Inner loop over pairs of j neighbors (A,B); an odd leftover neighbor is handled after the loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector between the i and j particles */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq00,rinv00),crf));
+            felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fscal,dx00,dy00,dz00);
+
+            /* Inner loop uses 47 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* Odd leftover j neighbor: load its coordinates through a single pointer */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector between the i and j particles */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq00,rinv00),crf));
+            felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            /* Update potential sums; unpacklo with zero presumably clears the unused second element first */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+            vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fscal,dx00,dy00,dz00);
+
+            /* Inner loop uses 47 flops */
+        }
+
+        /* End of innermost loop: pass the accumulated i force to the f and fshift update helper */
+
+        gmx_fjsp_update_iforce_1atom_swizzle_v2r8(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies of energy group ggid */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 9 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Report flops: 9 per outer iteration plus 47 per j neighbor */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_VF,outeriter*9 + inneriter*47);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRF_VdwLJ_GeomP1P1_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: ReactionField
+ * VdW interaction:            LennardJones
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecRF_VdwLJ_GeomP1P1_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Reaction-field plus Lennard-Jones kernel for single particles: forces only, no potential sums.
+     * Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop; this
+     * non-water kernel only uses 0. Suffixes A,B refer to the two j neighbors (jnrA,jnrB) of one
+     * unrolled inner-loop step, whose data share the two positions of the double precision SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+    krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0);
+    crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf);
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    /* Give the j indices and offsets defined values up front to avoid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Offset of this list's shift vector (DIM reals per vector), used for shiftvec and fshift */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* The j neighbors of this list are jjnr[j_index_start] up to jjnr[j_index_end-1] */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_1rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+
+        /* i particle charge pre-multiplied by epsfac, and the i-type offset into the (c6,c12) pair table */
+        iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_load1_v2r8(charge+inr+0));
+        vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+        /* Inner loop over pairs of j neighbors (A,B); an odd leftover neighbor is handled after the loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector between the i and j particles */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,c6_00),_fjsp_mul_v2r8(rinvsix,rinvsq00));
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fscal,dx00,dy00,dz00);
+
+            /* Inner loop uses 37 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* Odd leftover j neighbor: load its coordinates through a single pointer */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector between the i and j particles */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,c6_00),_fjsp_mul_v2r8(rinvsix,rinvsq00));
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fscal,dx00,dy00,dz00);
+
+            /* Inner loop uses 37 flops */
+        }
+
+        /* End of innermost loop: pass the accumulated i force to the f and fshift update helper */
+
+        gmx_fjsp_update_iforce_1atom_swizzle_v2r8(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 7 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Report flops: 7 per outer iteration plus 37 per j neighbor */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_F,outeriter*7 + inneriter*37);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRF_VdwLJ_GeomW3P1_sparc64_hpc_ace_double.c b/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRF_VdwLJ_GeomW3P1_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..13978d1
--- /dev/null
@@ -0,0 +1,855 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRF_VdwLJ_GeomW3P1_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: ReactionField
+ * VdW interaction:            LennardJones
+ * Geometry:                   Water3-Particle
+ * Calculate force/pot:        PotentialAndForce
+ *
+ * For each of the nlist->nri list entries, the three i atoms inr+0, inr+1 and
+ * inr+2 (suffixes 0,1,2; inr = iinr[iidx]) interact with the j particles
+ * jjnr[jindex[iidx]] .. jjnr[jindex[iidx+1]-1]. All three i atoms get a
+ * reaction-field electrostatics term with each j particle; only i atom 0 also
+ * gets a Lennard-Jones term.
+ *
+ * Inputs read:  nlist (nri, iinr, jindex, jjnr, shift, gid), xx (coordinates),
+ *               fr (shift_vec, epsfac, ic->k_rf, ic->c_rf, ntype, nbfp),
+ *               mdatoms (chargeA, typeA).
+ * Outputs:      force data is passed on for ff and fr->fshift, energy sums for
+ *               kernel_data->energygrp_elec / energygrp_vdw, and a flop
+ *               estimate for nrnb, all through helper routines that are not
+ *               defined in this file.
+ */
+void
+nb_kernel_ElecRF_VdwLJ_GeomW3P1_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     *
+     * Many of the locals declared below (e.g. tx/ty/tz, rcutoff_scalar, rcutoff*, jidxall,
+     * vdwioffset1/2, isai*, isaj0, r00/r10/r20, c6_10.., rvdw, fvdw6/12, sh_vdw_invrcut6, itab_tmp,
+     * the masks, one, two and the vfconv/gbconv/ewconv unions) are never referenced in this kernel.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+    krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0);
+    crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf);
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    /* Setup water-specific parameters.
+     * The three i charges (each multiplied by facel) and the vdW parameter offset of i atom 0
+     * are taken from the first list entry only and reused for every entry of the outer loop.
+     * NOTE(review): this presumably relies on all i entries being identical waters, and reads
+     * iinr[0] regardless of nri - confirm both against the caller / list setup.
+     */
+    inr              = nlist->iinr[0];
+    iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* Give these a defined value so compilers do not warn about possibly uninitialized use */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists: one iteration per i entry (three i atoms) */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Offset (in reals, DIM per vector) of this entry's shift vector; used for shiftvec and fshift */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors: j particles are jjnr[j_index_start .. j_index_end-1] */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index (offset in reals of the first i atom in x and f) */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector (done by the helper for all three i atoms) */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+
+        /* Reset potential sums (accumulated per i entry, flushed after the inner loop) */
+        velecsum         = _fjsp_setzero_v2r8();
+        vvdwsum          = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop: two j particles (A and B) are processed per iteration.
+         * The loop stops while at least one j particle may remain; an odd leftover particle is
+         * handled by the if-block that follows the loop.
+         */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates of both particles into jx0/jy0/jz0 */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector (i minus j) for each of the three i atoms */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+
+            /* Calculate squared distance and things based on it (1/r and 1/r^2) */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+
+            /* Load parameters for j particles: charges, and 2*type as offset into a vdwparam row */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* i atom 0 - j: charge product and the (c6,c12) pair for this type combination */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            /* REACTION-FIELD ELECTROSTATICS
+             * velec = qq*(1/r + krf*r^2 - crf), felec = qq*(1/r^3 - 2*krf), the latter being a
+             * scalar that is multiplied by the displacement below.
+             * (Assumes _fjsp_madd(a,b,c) is a*b+c and _fjsp_msub(a,b,c) is a*b-c - TODO confirm.)
+             */
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq00,rinv00),crf));
+            felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+
+            /* LENNARD-JONES DISPERSION/REPULSION
+             * vvdw = vvdw12/12 - vvdw6/6 and fvdw = (vvdw12 - vvdw6)/r^2 (same msub assumption).
+             * NOTE(review): the 1/12 and 1/6 factors suggest the table stores c6 and c12
+             * pre-multiplied by 6 and 12 - confirm against the code that fills fr->nbfp.
+             */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            /* Update vectorial force: fscal*displacement goes to the i sum and to the j sum,
+             * the j sum being handed to the decrement helper at the end of the iteration.
+             */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* i atom 1 - j: charge product only, no Lennard-Jones term for this i atom */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq10,rinv10),crf));
+            felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* i atom 2 - j: charge product only, no Lennard-Jones term for this i atom */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq20,rinv20),crf));
+            felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 120 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* Remainder: a single leftover j particle (A only). Load its coordinates; the
+             * contents of the second SIMD element depend on the helper and are not relied on,
+             * since energies and fscal are masked with _fjsp_unpacklo_v2r8 below.
+             */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+
+            /* Load parameters for j particles (one charge, loaded on top of a zeroed register) */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* i atom 0 - j: charge product and the (c6,c12) pair for this type combination */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            /* REACTION-FIELD ELECTROSTATICS (same expressions as in the two-particle loop) */
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq00,rinv00),crf));
+            felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+
+            /* LENNARD-JONES DISPERSION/REPULSION (same expressions as in the two-particle loop) */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom.
+             * The values are first combined with a zero register via _fjsp_unpacklo_v2r8;
+             * presumably this keeps the first element and zeroes the unused second one so that
+             * it adds nothing to the sums - TODO confirm the intrinsic's semantics.
+             */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+            vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* i atom 1 - j: charge product only, no Lennard-Jones term for this i atom */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq10,rinv10),crf));
+            felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* i atom 2 - j: charge product only, no Lennard-Jones term for this i atom */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq20,rinv20),crf));
+            felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 120 flops */
+        }
+
+        /* End of innermost loop: hand the nine i force sums to the helper together with the
+         * i atom and shift-force destinations (its reduction across SIMD elements is defined elsewhere).
+         */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies: pass this entry's sums for energy-group slot ggid */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations (counted per j particle, not per SIMD pair) */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 20 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops using the per-iteration estimates quoted above (20 and 120) */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_VF,outeriter*20 + inneriter*120);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRF_VdwLJ_GeomW3P1_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: ReactionField
+ * VdW interaction:            LennardJones
+ * Geometry:                   Water3-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecRF_VdwLJ_GeomW3P1_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+    krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0);
+    crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf);
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,c6_00),_fjsp_mul_v2r8(rinvsix,rinvsq00));
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 100 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+            gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,c6_00),_fjsp_mul_v2r8(rinvsix,rinvsq00));
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 100 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 18 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_F,outeriter*18 + inneriter*100);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRF_VdwLJ_GeomW3W3_sparc64_hpc_ace_double.c b/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRF_VdwLJ_GeomW3W3_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..b419e70
--- /dev/null
@@ -0,0 +1,1525 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRF_VdwLJ_GeomW3W3_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: ReactionField
+ * VdW interaction:            LennardJones
+ * Geometry:                   Water3-Water3
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecRF_VdwLJ_GeomW3W3_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B;
+    _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B;
+    _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+    _fjsp_v2r8       dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+    krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0);
+    crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf);
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq0              = gmx_fjsp_set1_v2r8(charge[inr+0]);
+    jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+    jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    qq00             = _fjsp_mul_v2r8(iq0,jq0);
+    c6_00            = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    qq01             = _fjsp_mul_v2r8(iq0,jq1);
+    qq02             = _fjsp_mul_v2r8(iq0,jq2);
+    qq10             = _fjsp_mul_v2r8(iq1,jq0);
+    qq11             = _fjsp_mul_v2r8(iq1,jq1);
+    qq12             = _fjsp_mul_v2r8(iq1,jq2);
+    qq20             = _fjsp_mul_v2r8(iq2,jq0);
+    qq21             = _fjsp_mul_v2r8(iq2,jq1);
+    qq22             = _fjsp_mul_v2r8(iq2,jq2);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+        vvdwsum          = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx01             = _fjsp_sub_v2r8(ix0,jx1);
+            dy01             = _fjsp_sub_v2r8(iy0,jy1);
+            dz01             = _fjsp_sub_v2r8(iz0,jz1);
+            dx02             = _fjsp_sub_v2r8(ix0,jx2);
+            dy02             = _fjsp_sub_v2r8(iy0,jy2);
+            dz02             = _fjsp_sub_v2r8(iz0,jz2);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+            rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+            rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+            rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq00,rinv00),crf));
+            felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq01,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq01,rinv01),crf));
+            felec            = _fjsp_mul_v2r8(qq01,_fjsp_msub_v2r8(rinv01,rinvsq01,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+            
+            fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq02,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq02,rinv02),crf));
+            felec            = _fjsp_mul_v2r8(qq02,_fjsp_msub_v2r8(rinv02,rinvsq02,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+            
+            fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq10,rinv10),crf));
+            felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq11,rinv11),crf));
+            felec            = _fjsp_mul_v2r8(qq11,_fjsp_msub_v2r8(rinv11,rinvsq11,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq12,rinv12),crf));
+            felec            = _fjsp_mul_v2r8(qq12,_fjsp_msub_v2r8(rinv12,rinvsq12,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq20,rinv20),crf));
+            felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq21,rinv21),crf));
+            felec            = _fjsp_mul_v2r8(qq21,_fjsp_msub_v2r8(rinv21,rinvsq21,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq22,rinv22),crf));
+            felec            = _fjsp_mul_v2r8(qq22,_fjsp_msub_v2r8(rinv22,rinvsq22,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            gmx_fjsp_decrement_3rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 327 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx01             = _fjsp_sub_v2r8(ix0,jx1);
+            dy01             = _fjsp_sub_v2r8(iy0,jy1);
+            dz01             = _fjsp_sub_v2r8(iz0,jz1);
+            dx02             = _fjsp_sub_v2r8(ix0,jx2);
+            dy02             = _fjsp_sub_v2r8(iy0,jy2);
+            dz02             = _fjsp_sub_v2r8(iz0,jz2);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+            rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+            rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+            rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq00,rinv00),crf));
+            felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+            vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq01,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq01,rinv01),crf));
+            felec            = _fjsp_mul_v2r8(qq01,_fjsp_msub_v2r8(rinv01,rinvsq01,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+            
+            fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq02,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq02,rinv02),crf));
+            felec            = _fjsp_mul_v2r8(qq02,_fjsp_msub_v2r8(rinv02,rinvsq02,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+            
+            fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq10,rinv10),crf));
+            felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq11,rinv11),crf));
+            felec            = _fjsp_mul_v2r8(qq11,_fjsp_msub_v2r8(rinv11,rinvsq11,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq12,rinv12),crf));
+            felec            = _fjsp_mul_v2r8(qq12,_fjsp_msub_v2r8(rinv12,rinvsq12,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq20,rinv20),crf));
+            felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq21,rinv21),crf));
+            felec            = _fjsp_mul_v2r8(qq21,_fjsp_msub_v2r8(rinv21,rinvsq21,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq22,rinv22),crf));
+            felec            = _fjsp_mul_v2r8(qq22,_fjsp_msub_v2r8(rinv22,rinvsq22,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            gmx_fjsp_decrement_3rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 327 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 20 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*327);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRF_VdwLJ_GeomW3W3_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: ReactionField
+ * VdW interaction:            LennardJones
+ * Geometry:                   Water3-Water3
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecRF_VdwLJ_GeomW3W3_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. the two different
+     * jnr indices whose data is put in the two positions of the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B;
+    _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B;
+    _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+    _fjsp_v2r8       dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+    krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0);
+    crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf);
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq0              = gmx_fjsp_set1_v2r8(charge[inr+0]);
+    jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+    jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    qq00             = _fjsp_mul_v2r8(iq0,jq0);
+    c6_00            = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    qq01             = _fjsp_mul_v2r8(iq0,jq1);
+    qq02             = _fjsp_mul_v2r8(iq0,jq2);
+    qq10             = _fjsp_mul_v2r8(iq1,jq0);
+    qq11             = _fjsp_mul_v2r8(iq1,jq1);
+    qq12             = _fjsp_mul_v2r8(iq1,jq2);
+    qq20             = _fjsp_mul_v2r8(iq2,jq0);
+    qq21             = _fjsp_mul_v2r8(iq2,jq1);
+    qq22             = _fjsp_mul_v2r8(iq2,jq2);
+
+    /* Initialize the j indices and coordinate offsets up front to avoid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx01             = _fjsp_sub_v2r8(ix0,jx1);
+            dy01             = _fjsp_sub_v2r8(iy0,jy1);
+            dz01             = _fjsp_sub_v2r8(iz0,jz1);
+            dx02             = _fjsp_sub_v2r8(ix0,jx2);
+            dy02             = _fjsp_sub_v2r8(iy0,jy2);
+            dz02             = _fjsp_sub_v2r8(iz0,jz2);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+            rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+            rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+            rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,c6_00),_fjsp_mul_v2r8(rinvsix,rinvsq00));
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq01,_fjsp_msub_v2r8(rinv01,rinvsq01,krf2));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+            
+            fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq02,_fjsp_msub_v2r8(rinv02,rinvsq02,krf2));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+            
+            fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq11,_fjsp_msub_v2r8(rinv11,rinvsq11,krf2));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq12,_fjsp_msub_v2r8(rinv12,rinvsq12,krf2));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq21,_fjsp_msub_v2r8(rinv21,rinvsq21,krf2));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq22,_fjsp_msub_v2r8(rinv22,rinvsq22,krf2));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            gmx_fjsp_decrement_3rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 277 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx01             = _fjsp_sub_v2r8(ix0,jx1);
+            dy01             = _fjsp_sub_v2r8(iy0,jy1);
+            dz01             = _fjsp_sub_v2r8(iz0,jz1);
+            dx02             = _fjsp_sub_v2r8(ix0,jx2);
+            dy02             = _fjsp_sub_v2r8(iy0,jy2);
+            dz02             = _fjsp_sub_v2r8(iz0,jz2);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+            rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+            rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+            rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,c6_00),_fjsp_mul_v2r8(rinvsix,rinvsq00));
+
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq01,_fjsp_msub_v2r8(rinv01,rinvsq01,krf2));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+            
+            fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq02,_fjsp_msub_v2r8(rinv02,rinvsq02,krf2));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+            
+            fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq11,_fjsp_msub_v2r8(rinv11,rinvsq11,krf2));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq12,_fjsp_msub_v2r8(rinv12,rinvsq12,krf2));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq21,_fjsp_msub_v2r8(rinv21,rinvsq21,krf2));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq22,_fjsp_msub_v2r8(rinv22,rinvsq22,krf2));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            gmx_fjsp_decrement_3rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 277 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 18 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*277);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRF_VdwLJ_GeomW4P1_sparc64_hpc_ace_double.c b/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRF_VdwLJ_GeomW4P1_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..45cb100
--- /dev/null
@@ -0,0 +1,963 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRF_VdwLJ_GeomW4P1_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: ReactionField
+ * VdW interaction:            LennardJones
+ * Geometry:                   Water4-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecRF_VdwLJ_GeomW4P1_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+    krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0);
+    crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf);
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_4rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+        fix3             = _fjsp_setzero_v2r8();
+        fiy3             = _fjsp_setzero_v2r8();
+        fiz3             = _fjsp_setzero_v2r8();
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+        vvdwsum          = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx30             = _fjsp_sub_v2r8(ix3,jx0);
+            dy30             = _fjsp_sub_v2r8(iy3,jy0);
+            dz30             = _fjsp_sub_v2r8(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+
+            rinvsq00         = gmx_fjsp_inv_v2r8(rsq00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq10,rinv10),crf));
+            felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq20,rinv20),crf));
+            felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _fjsp_mul_v2r8(iq3,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq30,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq30,rinv30),crf));
+            felec            = _fjsp_mul_v2r8(qq30,_fjsp_msub_v2r8(rinv30,rinvsq30,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+            
+            fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+
+            gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 143 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx30             = _fjsp_sub_v2r8(ix3,jx0);
+            dy30             = _fjsp_sub_v2r8(iy3,jy0);
+            dz30             = _fjsp_sub_v2r8(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+
+            rinvsq00         = gmx_fjsp_inv_v2r8(rsq00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq10,rinv10),crf));
+            felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq20,rinv20),crf));
+            felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _fjsp_mul_v2r8(iq3,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq30,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq30,rinv30),crf));
+            felec            = _fjsp_mul_v2r8(qq30,_fjsp_msub_v2r8(rinv30,rinvsq30,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+            
+            fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+
+            gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 143 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_4atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 26 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_VF,outeriter*26 + inneriter*143);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRF_VdwLJ_GeomW4P1_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: ReactionField
+ * VdW interaction:            LennardJones
+ * Geometry:                   Water4-Particle
+ * Calculate force/pot:        Force
+ *
+ * Summary of what the code below does:
+ *  - For every i entry in the neighborlist, four i sites (suffixes 0-3) are
+ *    loaded and their j neighbors are processed two at a time (A and B); a
+ *    leftover j particle, when the j count is odd, is handled in a separate
+ *    single-particle block after the unrolled loop.
+ *  - Site 0 takes part in the Lennard-Jones interaction only; sites 1-3 take
+ *    part in the reaction-field electrostatics only.
+ *  - Only force terms are computed. No potential energy sums are formed and
+ *    kernel_data is not read in this force-only variant.
+ *  - The number of outer/inner iterations is reported through inc_nrnb().
+ */
+void
+nb_kernel_ElecRF_VdwLJ_GeomW4P1_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     * NOTE(review): this generated declaration list contains names that are never
+     * referenced in this kernel variant (for example the rcutoff*, isai*, r*0 and
+     * *conv variables); they appear to come from the shared generator template.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+    krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0);
+    crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf);
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    /* Setup water-specific parameters.
+     * The charges of sites 1-3 (pre-multiplied by facel) and the VdW parameter
+     * offset of site 0 are read once, from the first i entry (iinr[0]), and are
+     * reused unchanged for every i entry in the outer loop below.
+     * NOTE(review): this presumably relies on all i entries in the list being
+     * the same water model, and iinr[0] is read even if nri is 0 - confirm that
+     * callers never pass an empty list.
+     */
+    inr              = nlist->iinr[0];
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    /* Give the j indices and coordinate offsets defined values to avoid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_4rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+        fix3             = _fjsp_setzero_v2r8();
+        fiy3             = _fjsp_setzero_v2r8();
+        fiz3             = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop: two j particles (A and B) per iteration.
+         * The bound j_index_end-1 stops before a possible odd last j particle,
+         * which is handled by the single-particle block after this loop.
+         */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx30             = _fjsp_sub_v2r8(ix3,jx0);
+            dy30             = _fjsp_sub_v2r8(iy3,jy0);
+            dz30             = _fjsp_sub_v2r8(iz3,jz0);
+
+            /* Calculate squared distance and things based on it.
+             * Site 0 only needs 1/r^2 (for Lennard-Jones); sites 1-3 need 1/r
+             * and 1/r^2 (for reaction-field).
+             */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+
+            rinvsq00         = gmx_fjsp_inv_v2r8(rsq00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+            vdwjidx0B        = 2*vdwtype[jnrB+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,
+                                         vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION
+             * fvdw = (c12*rinvsix - c6) * rinvsix * rinvsq, assuming
+             * _fjsp_msub_v2r8(a,b,c) evaluates a*b-c - confirm against the intrinsic docs.
+             */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,c6_00),_fjsp_mul_v2r8(rinvsix,rinvsq00));
+
+            fscal            = fvdw;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS
+             * felec = qq * (rinv*rinvsq - krf2), with krf2 = 2*k_rf, under the
+             * same assumption about _fjsp_msub_v2r8 as above.
+             */
+            felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _fjsp_mul_v2r8(iq3,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq30,_fjsp_msub_v2r8(rinv30,rinvsq30,krf2));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+            
+            fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+
+            gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 123 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates (odd-count remainder: a single j particle, only the A index is used) */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx30             = _fjsp_sub_v2r8(ix3,jx0);
+            dy30             = _fjsp_sub_v2r8(iy3,jy0);
+            dz30             = _fjsp_sub_v2r8(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+
+            rinvsq00         = gmx_fjsp_inv_v2r8(rsq00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+
+            /* Load parameters for the single j particle; the charge is loaded on top
+             * of a zeroed register (presumably into the low element - confirm).
+             */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+            vdwjidx0A        = 2*vdwtype[jnrA+0];
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,c6_00),_fjsp_mul_v2r8(rinvsix,rinvsq00));
+
+            fscal            = fvdw;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force.
+             * In this remainder block every fscal is first combined with a zero
+             * register through _fjsp_unpacklo_v2r8 (see the line above), presumably
+             * so that the unused second element contributes no force - confirm.
+             */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _fjsp_mul_v2r8(iq3,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq30,_fjsp_msub_v2r8(rinv30,rinvsq30,krf2));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+            
+            fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+
+            gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 123 flops */
+        }
+
+        /* End of innermost loop.
+         * The i force accumulators of all four sites are handed to the update
+         * helper together with the force and shift-force locations of this i entry.
+         */
+
+        gmx_fjsp_update_iforce_4atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 24 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops (24 per outer iteration, 123 per inner iteration, as noted above) */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_F,outeriter*24 + inneriter*123);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRF_VdwLJ_GeomW4W4_sparc64_hpc_ace_double.c b/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRF_VdwLJ_GeomW4W4_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..ae69a1d
--- /dev/null
@@ -0,0 +1,1645 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRF_VdwLJ_GeomW4W4_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: ReactionField
+ * VdW interaction:            LennardJones
+ * Geometry:                   Water4-Water4
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecRF_VdwLJ_GeomW4W4_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B;
+    _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B;
+    _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    int              vdwjidx3A,vdwjidx3B;
+    _fjsp_v2r8       jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    _fjsp_v2r8       dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+    _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    _fjsp_v2r8       dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+    _fjsp_v2r8       dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+    _fjsp_v2r8       dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+    _fjsp_v2r8       dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+    krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0);
+    crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf);
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+    jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+    jq3              = gmx_fjsp_set1_v2r8(charge[inr+3]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    c6_00            = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    qq11             = _fjsp_mul_v2r8(iq1,jq1);
+    qq12             = _fjsp_mul_v2r8(iq1,jq2);
+    qq13             = _fjsp_mul_v2r8(iq1,jq3);
+    qq21             = _fjsp_mul_v2r8(iq2,jq1);
+    qq22             = _fjsp_mul_v2r8(iq2,jq2);
+    qq23             = _fjsp_mul_v2r8(iq2,jq3);
+    qq31             = _fjsp_mul_v2r8(iq3,jq1);
+    qq32             = _fjsp_mul_v2r8(iq3,jq2);
+    qq33             = _fjsp_mul_v2r8(iq3,jq3);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_4rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+        fix3             = _fjsp_setzero_v2r8();
+        fiy3             = _fjsp_setzero_v2r8();
+        fiz3             = _fjsp_setzero_v2r8();
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+        vvdwsum          = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_4rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx13             = _fjsp_sub_v2r8(ix1,jx3);
+            dy13             = _fjsp_sub_v2r8(iy1,jy3);
+            dz13             = _fjsp_sub_v2r8(iz1,jz3);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+            dx23             = _fjsp_sub_v2r8(ix2,jx3);
+            dy23             = _fjsp_sub_v2r8(iy2,jy3);
+            dz23             = _fjsp_sub_v2r8(iz2,jz3);
+            dx31             = _fjsp_sub_v2r8(ix3,jx1);
+            dy31             = _fjsp_sub_v2r8(iy3,jy1);
+            dz31             = _fjsp_sub_v2r8(iz3,jz1);
+            dx32             = _fjsp_sub_v2r8(ix3,jx2);
+            dy32             = _fjsp_sub_v2r8(iy3,jy2);
+            dz32             = _fjsp_sub_v2r8(iz3,jz2);
+            dx33             = _fjsp_sub_v2r8(ix3,jx3);
+            dy33             = _fjsp_sub_v2r8(iy3,jy3);
+            dz33             = _fjsp_sub_v2r8(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+            rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+            rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+            rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+            rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+            rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+            rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+            rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+            rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+
+            rinvsq00         = gmx_fjsp_inv_v2r8(rsq00);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+            rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+            rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+            rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+            rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+            fjx3             = _fjsp_setzero_v2r8();
+            fjy3             = _fjsp_setzero_v2r8();
+            fjz3             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq11,rinv11),crf));
+            felec            = _fjsp_mul_v2r8(qq11,_fjsp_msub_v2r8(rinv11,rinvsq11,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq12,rinv12),crf));
+            felec            = _fjsp_mul_v2r8(qq12,_fjsp_msub_v2r8(rinv12,rinvsq12,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq13,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq13,rinv13),crf));
+            felec            = _fjsp_mul_v2r8(qq13,_fjsp_msub_v2r8(rinv13,rinvsq13,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+            
+            fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq21,rinv21),crf));
+            felec            = _fjsp_mul_v2r8(qq21,_fjsp_msub_v2r8(rinv21,rinvsq21,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq22,rinv22),crf));
+            felec            = _fjsp_mul_v2r8(qq22,_fjsp_msub_v2r8(rinv22,rinvsq22,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq23,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq23,rinv23),crf));
+            felec            = _fjsp_mul_v2r8(qq23,_fjsp_msub_v2r8(rinv23,rinvsq23,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+            
+            fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq31,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq31,rinv31),crf));
+            felec            = _fjsp_mul_v2r8(qq31,_fjsp_msub_v2r8(rinv31,rinvsq31,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+            
+            fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq32,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq32,rinv32),crf));
+            felec            = _fjsp_mul_v2r8(qq32,_fjsp_msub_v2r8(rinv32,rinvsq32,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+            
+            fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq33,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq33,rinv33),crf));
+            felec            = _fjsp_mul_v2r8(qq33,_fjsp_msub_v2r8(rinv33,rinvsq33,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+            
+            fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+
+            gmx_fjsp_decrement_4rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 353 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_4rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx13             = _fjsp_sub_v2r8(ix1,jx3);
+            dy13             = _fjsp_sub_v2r8(iy1,jy3);
+            dz13             = _fjsp_sub_v2r8(iz1,jz3);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+            dx23             = _fjsp_sub_v2r8(ix2,jx3);
+            dy23             = _fjsp_sub_v2r8(iy2,jy3);
+            dz23             = _fjsp_sub_v2r8(iz2,jz3);
+            dx31             = _fjsp_sub_v2r8(ix3,jx1);
+            dy31             = _fjsp_sub_v2r8(iy3,jy1);
+            dz31             = _fjsp_sub_v2r8(iz3,jz1);
+            dx32             = _fjsp_sub_v2r8(ix3,jx2);
+            dy32             = _fjsp_sub_v2r8(iy3,jy2);
+            dz32             = _fjsp_sub_v2r8(iz3,jz2);
+            dx33             = _fjsp_sub_v2r8(ix3,jx3);
+            dy33             = _fjsp_sub_v2r8(iy3,jy3);
+            dz33             = _fjsp_sub_v2r8(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+            rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+            rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+            rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+            rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+            rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+            rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+            rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+            rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+
+            rinvsq00         = gmx_fjsp_inv_v2r8(rsq00);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+            rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+            rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+            rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+            rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+            fjx3             = _fjsp_setzero_v2r8();
+            fjy3             = _fjsp_setzero_v2r8();
+            fjz3             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            vvdw6            = _fjsp_mul_v2r8(c6_00,rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_00,_fjsp_mul_v2r8(rinvsix,rinvsix));
+            vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq00);
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+
+            fscal            = fvdw;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq11,rinv11),crf));
+            felec            = _fjsp_mul_v2r8(qq11,_fjsp_msub_v2r8(rinv11,rinvsq11,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq12,rinv12),crf));
+            felec            = _fjsp_mul_v2r8(qq12,_fjsp_msub_v2r8(rinv12,rinvsq12,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq13,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq13,rinv13),crf));
+            felec            = _fjsp_mul_v2r8(qq13,_fjsp_msub_v2r8(rinv13,rinvsq13,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+            
+            fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq21,rinv21),crf));
+            felec            = _fjsp_mul_v2r8(qq21,_fjsp_msub_v2r8(rinv21,rinvsq21,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq22,rinv22),crf));
+            felec            = _fjsp_mul_v2r8(qq22,_fjsp_msub_v2r8(rinv22,rinvsq22,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq23,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq23,rinv23),crf));
+            felec            = _fjsp_mul_v2r8(qq23,_fjsp_msub_v2r8(rinv23,rinvsq23,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+            
+            fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq31,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq31,rinv31),crf));
+            felec            = _fjsp_mul_v2r8(qq31,_fjsp_msub_v2r8(rinv31,rinvsq31,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+            
+            fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq32,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq32,rinv32),crf));
+            felec            = _fjsp_mul_v2r8(qq32,_fjsp_msub_v2r8(rinv32,rinvsq32,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+            
+            fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq33,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq33,rinv33),crf));
+            felec            = _fjsp_mul_v2r8(qq33,_fjsp_msub_v2r8(rinv33,rinvsq33,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+            
+            fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+
+            gmx_fjsp_decrement_4rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 353 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_4atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+        gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 26 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*353);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRF_VdwLJ_GeomW4W4_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: ReactionField
+ * VdW interaction:            LennardJones
+ * Geometry:                   Water4-Water4
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecRF_VdwLJ_GeomW4W4_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B;
+    _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B;
+    _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    int              vdwjidx3A,vdwjidx3B;
+    _fjsp_v2r8       jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    _fjsp_v2r8       dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+    _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    _fjsp_v2r8       dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+    _fjsp_v2r8       dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+    _fjsp_v2r8       dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+    _fjsp_v2r8       dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+    krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0);
+    crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf);
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
+
+    jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+    jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+    jq3              = gmx_fjsp_set1_v2r8(charge[inr+3]);
+    vdwjidx0A        = 2*vdwtype[inr+0];
+    c6_00            = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A]);
+    c12_00           = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset0+vdwjidx0A+1]);
+    qq11             = _fjsp_mul_v2r8(iq1,jq1);
+    qq12             = _fjsp_mul_v2r8(iq1,jq2);
+    qq13             = _fjsp_mul_v2r8(iq1,jq3);
+    qq21             = _fjsp_mul_v2r8(iq2,jq1);
+    qq22             = _fjsp_mul_v2r8(iq2,jq2);
+    qq23             = _fjsp_mul_v2r8(iq2,jq3);
+    qq31             = _fjsp_mul_v2r8(iq3,jq1);
+    qq32             = _fjsp_mul_v2r8(iq3,jq2);
+    qq33             = _fjsp_mul_v2r8(iq3,jq3);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_4rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+        fix3             = _fjsp_setzero_v2r8();
+        fiy3             = _fjsp_setzero_v2r8();
+        fiz3             = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_4rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx13             = _fjsp_sub_v2r8(ix1,jx3);
+            dy13             = _fjsp_sub_v2r8(iy1,jy3);
+            dz13             = _fjsp_sub_v2r8(iz1,jz3);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+            dx23             = _fjsp_sub_v2r8(ix2,jx3);
+            dy23             = _fjsp_sub_v2r8(iy2,jy3);
+            dz23             = _fjsp_sub_v2r8(iz2,jz3);
+            dx31             = _fjsp_sub_v2r8(ix3,jx1);
+            dy31             = _fjsp_sub_v2r8(iy3,jy1);
+            dz31             = _fjsp_sub_v2r8(iz3,jz1);
+            dx32             = _fjsp_sub_v2r8(ix3,jx2);
+            dy32             = _fjsp_sub_v2r8(iy3,jy2);
+            dz32             = _fjsp_sub_v2r8(iz3,jz2);
+            dx33             = _fjsp_sub_v2r8(ix3,jx3);
+            dy33             = _fjsp_sub_v2r8(iy3,jy3);
+            dz33             = _fjsp_sub_v2r8(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+            rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+            rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+            rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+            rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+            rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+            rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+            rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+            rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+
+            rinvsq00         = gmx_fjsp_inv_v2r8(rsq00);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+            rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+            rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+            rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+            rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+            fjx3             = _fjsp_setzero_v2r8();
+            fjy3             = _fjsp_setzero_v2r8();
+            fjz3             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,c6_00),_fjsp_mul_v2r8(rinvsix,rinvsq00));
+
+            fscal            = fvdw;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq11,_fjsp_msub_v2r8(rinv11,rinvsq11,krf2));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq12,_fjsp_msub_v2r8(rinv12,rinvsq12,krf2));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq13,_fjsp_msub_v2r8(rinv13,rinvsq13,krf2));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+            
+            fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq21,_fjsp_msub_v2r8(rinv21,rinvsq21,krf2));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq22,_fjsp_msub_v2r8(rinv22,rinvsq22,krf2));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq23,_fjsp_msub_v2r8(rinv23,rinvsq23,krf2));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+            
+            fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq31,_fjsp_msub_v2r8(rinv31,rinvsq31,krf2));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+            
+            fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq32,_fjsp_msub_v2r8(rinv32,rinvsq32,krf2));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+            
+            fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq33,_fjsp_msub_v2r8(rinv33,rinvsq33,krf2));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+            
+            fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+
+            gmx_fjsp_decrement_4rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 303 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_4rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx13             = _fjsp_sub_v2r8(ix1,jx3);
+            dy13             = _fjsp_sub_v2r8(iy1,jy3);
+            dz13             = _fjsp_sub_v2r8(iz1,jz3);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+            dx23             = _fjsp_sub_v2r8(ix2,jx3);
+            dy23             = _fjsp_sub_v2r8(iy2,jy3);
+            dz23             = _fjsp_sub_v2r8(iz2,jz3);
+            dx31             = _fjsp_sub_v2r8(ix3,jx1);
+            dy31             = _fjsp_sub_v2r8(iy3,jy1);
+            dz31             = _fjsp_sub_v2r8(iz3,jz1);
+            dx32             = _fjsp_sub_v2r8(ix3,jx2);
+            dy32             = _fjsp_sub_v2r8(iy3,jy2);
+            dz32             = _fjsp_sub_v2r8(iz3,jz2);
+            dx33             = _fjsp_sub_v2r8(ix3,jx3);
+            dy33             = _fjsp_sub_v2r8(iy3,jy3);
+            dz33             = _fjsp_sub_v2r8(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+            rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+            rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+            rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+            rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+            rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+            rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+            rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+            rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+
+            rinvsq00         = gmx_fjsp_inv_v2r8(rsq00);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+            rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+            rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+            rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+            rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+            fjx3             = _fjsp_setzero_v2r8();
+            fjy3             = _fjsp_setzero_v2r8();
+            fjz3             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq00,rinvsq00),rinvsq00);
+            fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_00,rinvsix,c6_00),_fjsp_mul_v2r8(rinvsix,rinvsq00));
+
+            fscal            = fvdw;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq11,_fjsp_msub_v2r8(rinv11,rinvsq11,krf2));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq12,_fjsp_msub_v2r8(rinv12,rinvsq12,krf2));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq13,_fjsp_msub_v2r8(rinv13,rinvsq13,krf2));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+            
+            fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq21,_fjsp_msub_v2r8(rinv21,rinvsq21,krf2));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq22,_fjsp_msub_v2r8(rinv22,rinvsq22,krf2));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq23,_fjsp_msub_v2r8(rinv23,rinvsq23,krf2));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+            
+            fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq31,_fjsp_msub_v2r8(rinv31,rinvsq31,krf2));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+            
+            fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq32,_fjsp_msub_v2r8(rinv32,rinvsq32,krf2));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+            
+            fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq33,_fjsp_msub_v2r8(rinv33,rinvsq33,krf2));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+            
+            fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+
+            gmx_fjsp_decrement_4rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 303 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_4atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 24 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*303);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRF_VdwNone_GeomP1P1_sparc64_hpc_ace_double.c b/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRF_VdwNone_GeomP1P1_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..7155768
--- /dev/null
@@ -0,0 +1,486 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRF_VdwNone_GeomP1P1_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: ReactionField
+ * VdW interaction:            None
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecRF_VdwNone_GeomP1P1_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,       /* neighbor list; nri, iinr, jindex, jjnr, shift and gid are read */
+                     rvec * gmx_restrict                    xx,          /* coordinates, accessed as a flat real array through xx[0] */
+                     rvec * gmx_restrict                    ff,          /* forces, accessed as a flat real array through ff[0] and handed to the force helpers */
+                     t_forcerec * gmx_restrict              fr,          /* supplies shift_vec, fshift, epsfac, ic->k_rf and ic->c_rf */
+                     t_mdatoms * gmx_restrict               mdatoms,     /* supplies chargeA */
+                     nb_kernel_data_t * gmx_restrict        kernel_data, /* energygrp_elec[gid] is passed to the potential update helper */
+                     t_nrnb * gmx_restrict                  nrnb)        /* flop accounting, updated through inc_nrnb() */
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters; this particle-particle kernel only uses suffix 0.
+     * Suffixes A,B refer to the two-way j loop unrolling done with double precision SIMD, i.e. the two
+     * jnr indices whose data are packed into one _fjsp_v2r8 variable. Several locals below are never used here.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+    krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0); /* 2*k_rf, used in the force expression */
+    crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf);
+
+    /* Zero the j indices and offsets up front to avoid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_1rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+
+        /* Load parameters for i particles (i charge pre-multiplied by facel) */
+        iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_load1_v2r8(charge+inr+0));
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop: two j neighbors (A and B) per iteration */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS: assuming madd(a,b,c)=a*b+c and msub(a,b,c)=a*b-c, velec=qq*(krf*rsq+rinv-crf), felec=qq*(rinv*rinvsq-krf2) */
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq00,rinv00),crf));
+            felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fscal,dx00,dy00,dz00);
+
+            /* Inner loop uses 35 flops */
+        }
+
+        if(jidx<j_index_end) /* remainder: one j neighbor is left over when the neighbor count is odd */
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+
+            /* Load parameters for j particles (single charge for jnrA, combined with a zeroed register) */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS: same expressions as in the unrolled loop above */
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq00,rinv00),crf));
+            felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+
+            /* Update potential sum for this i atom; the unpacklo with zero presumably clears the element that holds no j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8()); /* presumably zeroes the element that holds no j atom */
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fscal,dx00,dy00,dz00);
+
+            /* Inner loop uses 35 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_1atom_swizzle_v2r8(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies for the energy group of this list */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 8 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VF,outeriter*8 + inneriter*35);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRF_VdwNone_GeomP1P1_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: ReactionField
+ * VdW interaction:            None
+ * Geometry:                   Particle-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecRF_VdwNone_GeomP1P1_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,       /* neighbor list; nri, iinr, jindex, jjnr, shift and gid are read */
+                     rvec * gmx_restrict                    xx,          /* coordinates, accessed as a flat real array through xx[0] */
+                     rvec * gmx_restrict                    ff,          /* forces, accessed as a flat real array through ff[0] and handed to the force helpers */
+                     t_forcerec * gmx_restrict              fr,          /* supplies shift_vec, fshift, epsfac, ic->k_rf and ic->c_rf */
+                     t_mdatoms * gmx_restrict               mdatoms,     /* supplies chargeA */
+                     nb_kernel_data_t * gmx_restrict        kernel_data, /* not referenced in this force-only kernel */
+                     t_nrnb * gmx_restrict                  nrnb)        /* flop accounting, updated through inc_nrnb() */
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters; this particle-particle kernel only uses suffix 0.
+     * Suffixes A,B refer to the two-way j loop unrolling done with double precision SIMD, i.e. the two
+     * jnr indices whose data are packed into one _fjsp_v2r8 variable. Several locals below are never used here.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid; /* set but not read again in this force-only kernel */
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf); /* set but not read again in this force-only kernel */
+    krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0); /* 2*k_rf, used in the force expression */
+    crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf); /* set but not read again in this force-only kernel */
+
+    /* Zero the j indices and offsets up front to avoid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_1rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+
+        /* Load parameters for i particles (i charge pre-multiplied by facel) */
+        iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_load1_v2r8(charge+inr+0));
+
+        /* Start inner kernel loop: two j neighbors (A and B) per iteration */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS: assuming msub(a,b,c)=a*b-c, felec=qq*(rinv*rinvsq-krf2) */
+            felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fscal,dx00,dy00,dz00);
+
+            /* Inner loop uses 30 flops */
+        }
+
+        if(jidx<j_index_end) /* remainder: one j neighbor is left over when the neighbor count is odd */
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+
+            /* Load parameters for j particles (single charge for jnrA, combined with a zeroed register) */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS: same expression as in the unrolled loop above */
+            felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8()); /* presumably zeroes the element that holds no j atom */
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            gmx_fjsp_decrement_fma_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fscal,dx00,dy00,dz00);
+
+            /* Inner loop uses 30 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_1atom_swizzle_v2r8(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 7 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_F,outeriter*7 + inneriter*30);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRF_VdwNone_GeomW3P1_sparc64_hpc_ace_double.c b/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRF_VdwNone_GeomW3P1_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..e57c2e1
--- /dev/null
@@ -0,0 +1,792 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRF_VdwNone_GeomW3P1_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: ReactionField
+ * VdW interaction:            None
+ * Geometry:                   Water3-Particle
+ * Calculate force/pot:        PotentialAndForce
+ *
+ * For every i entry in nlist (three consecutive sites starting at atom iinr[iidx])
+ * this kernel loops over the j particles jjnr[jindex[iidx]] .. jjnr[jindex[iidx+1]-1],
+ * evaluates the reaction-field interaction of each of the three i sites with each
+ * j particle, and hands the accumulated forces and potential to the gmx_fjsp_*
+ * helpers together with the ff, fr->fshift and kernel_data->energygrp_elec
+ * destinations.
+ * Coordinates are read from xx, charges from mdatoms->chargeA and the
+ * reaction-field constants from fr->ic. A flop estimate is reported through
+ * inc_nrnb() under eNR_NBKERNEL_ELEC_W3_VF.
+ */
+void
+nb_kernel_ElecRF_VdwNone_GeomW3P1_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     *
+     * Note: a number of the locals declared below (e.g. tx, rcutoff, vdwioffset0, c6_00,
+     * itab_tmp, dummy_mask, one, two, vfconv) are never referenced in this kernel.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    /* Flat real pointers to the first coordinate/force element; indexed below as DIM*atom */
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    /* krf2 holds 2*k_rf and is the constant used in the force expression */
+    krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+    krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0);
+    crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf);
+
+    /* Setup water-specific parameters.
+     * The three site charges are taken from the first i entry only and, pre-multiplied
+     * by facel (fr->epsfac), reused for every i entry in the outer loop below.
+     * Note that iinr[0] is read unconditionally, i.e. even when nri is 0.
+     */
+    inr              = nlist->iinr[0];
+    iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        /* Clear the per-site i force accumulators for this list entry */
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop.
+         * Two j particles (A and B) are handled per iteration; an odd last entry is
+         * left for the single-particle block after the loop.
+         */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+
+            /* fjx0/fjy0/fjz0 collect the force terms of all three i sites so that the
+             * j force is updated with a single helper call at the end of the iteration.
+             */
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS
+             * Assuming madd(a,b,c)=a*b+c and msub(a,b,c)=a*b-c (TODO confirm against the
+             * HPC-ACE intrinsics documentation):
+             *   velec = qq*(krf*rsq + rinv - crf)
+             *   felec = qq*(rinv*rinvsq - krf2)
+             */
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq00,rinv00),crf));
+            felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq10,rinv10),crf));
+            felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq20,rinv20),crf));
+            felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 108 flops */
+        }
+
+        /* Remainder: one j particle is left when the number of j entries is odd.
+         * Only jnrA is set here; velec and fscal are passed through
+         * _fjsp_unpacklo_v2r8(...,zero) before being accumulated - presumably to clear
+         * the SIMD element that has no j particle (TODO confirm).
+         */
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq00,rinv00),crf));
+            felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq10,rinv10),crf));
+            felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq20,rinv20),crf));
+            felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 108 flops */
+        }
+
+        /* End of innermost loop */
+
+        /* Hand the accumulated i forces to the helper together with the i force and
+         * shift force destinations for this list entry.
+         */
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+
+        /* Increment number of inner iterations.
+         * This counts j entries, not SIMD iterations of the loop above.
+         */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 19 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3_VF,outeriter*19 + inneriter*108);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRF_VdwNone_GeomW3P1_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: ReactionField
+ * VdW interaction:            None
+ * Geometry:                   Water3-Particle
+ * Calculate force/pot:        Force
+ *
+ * Force-only variant: for every i entry in nlist (three consecutive sites
+ * starting at atom iinr[iidx]) the kernel loops over the j particles
+ * jjnr[jindex[iidx]] .. jjnr[jindex[iidx+1]-1], evaluates the reaction-field
+ * force factor of each of the three i sites with each j particle, and hands the
+ * accumulated forces to the gmx_fjsp_* helpers together with the ff and
+ * fr->fshift destinations. No potential is computed and kernel_data is not
+ * referenced. A flop estimate is reported through inc_nrnb() under
+ * eNR_NBKERNEL_ELEC_W3_F.
+ */
+void
+nb_kernel_ElecRF_VdwNone_GeomW3P1_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     *
+     * Note: a number of the locals declared below (e.g. tx, rcutoff, vdwioffset0, c6_00,
+     * itab_tmp, dummy_mask, one, two, vfconv, velec, velecsum, ggid) are never referenced
+     * in this kernel, and krf, crf and gid are assigned but not used afterwards.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    /* Flat real pointers to the first coordinate/force element; indexed below as DIM*atom */
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    /* Only krf2 (2*k_rf) enters the force expression used in this kernel */
+    krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+    krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0);
+    crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf);
+
+    /* Setup water-specific parameters.
+     * The three site charges are taken from the first i entry only and, pre-multiplied
+     * by facel (fr->epsfac), reused for every i entry in the outer loop below.
+     * Note that iinr[0] is read unconditionally, i.e. even when nri is 0.
+     */
+    inr              = nlist->iinr[0];
+    iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        /* Clear the per-site i force accumulators for this list entry */
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop.
+         * Two j particles (A and B) are handled per iteration; an odd last entry is
+         * left for the single-particle block after the loop.
+         */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+
+            /* fjx0/fjy0/fjz0 collect the force terms of all three i sites so that the
+             * j force is updated with a single helper call at the end of the iteration.
+             */
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS
+             * Assuming msub(a,b,c)=a*b-c (TODO confirm against the HPC-ACE intrinsics
+             * documentation):
+             *   felec = qq*(rinv*rinvsq - krf2)
+             */
+            felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 93 flops */
+        }
+
+        /* Remainder: one j particle is left when the number of j entries is odd.
+         * Only jnrA is set here; fscal is passed through _fjsp_unpacklo_v2r8(...,zero)
+         * before being used - presumably to clear the SIMD element that has no
+         * j particle (TODO confirm).
+         */
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+
+            /* Load parameters for j particles */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq00             = _fjsp_mul_v2r8(iq0,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 93 flops */
+        }
+
+        /* End of innermost loop */
+
+        /* Hand the accumulated i forces to the helper together with the i force and
+         * shift force destinations for this list entry.
+         */
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations.
+         * This counts j entries, not SIMD iterations of the loop above.
+         */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 18 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3_F,outeriter*18 + inneriter*93);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRF_VdwNone_GeomW3W3_sparc64_hpc_ace_double.c b/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRF_VdwNone_GeomW3W3_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..7b31dd4
--- /dev/null
@@ -0,0 +1,1468 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRF_VdwNone_GeomW3W3_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: ReactionField
+ * VdW interaction:            None
+ * Geometry:                   Water3-Water3
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecRF_VdwNone_GeomW3W3_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B;
+    _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B;
+    _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+    _fjsp_v2r8       dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+    krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0);
+    crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+
+    jq0              = gmx_fjsp_set1_v2r8(charge[inr+0]);
+    jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+    jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+    qq00             = _fjsp_mul_v2r8(iq0,jq0);
+    qq01             = _fjsp_mul_v2r8(iq0,jq1);
+    qq02             = _fjsp_mul_v2r8(iq0,jq2);
+    qq10             = _fjsp_mul_v2r8(iq1,jq0);
+    qq11             = _fjsp_mul_v2r8(iq1,jq1);
+    qq12             = _fjsp_mul_v2r8(iq1,jq2);
+    qq20             = _fjsp_mul_v2r8(iq2,jq0);
+    qq21             = _fjsp_mul_v2r8(iq2,jq1);
+    qq22             = _fjsp_mul_v2r8(iq2,jq2);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx01             = _fjsp_sub_v2r8(ix0,jx1);
+            dy01             = _fjsp_sub_v2r8(iy0,jy1);
+            dz01             = _fjsp_sub_v2r8(iz0,jz1);
+            dx02             = _fjsp_sub_v2r8(ix0,jx2);
+            dy02             = _fjsp_sub_v2r8(iy0,jy2);
+            dz02             = _fjsp_sub_v2r8(iz0,jz2);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+            rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+            rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+            rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq00,rinv00),crf));
+            felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq01,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq01,rinv01),crf));
+            felec            = _fjsp_mul_v2r8(qq01,_fjsp_msub_v2r8(rinv01,rinvsq01,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+            
+            fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq02,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq02,rinv02),crf));
+            felec            = _fjsp_mul_v2r8(qq02,_fjsp_msub_v2r8(rinv02,rinvsq02,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+            
+            fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq10,rinv10),crf));
+            felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq11,rinv11),crf));
+            felec            = _fjsp_mul_v2r8(qq11,_fjsp_msub_v2r8(rinv11,rinvsq11,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq12,rinv12),crf));
+            felec            = _fjsp_mul_v2r8(qq12,_fjsp_msub_v2r8(rinv12,rinvsq12,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq20,rinv20),crf));
+            felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq21,rinv21),crf));
+            felec            = _fjsp_mul_v2r8(qq21,_fjsp_msub_v2r8(rinv21,rinvsq21,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq22,rinv22),crf));
+            felec            = _fjsp_mul_v2r8(qq22,_fjsp_msub_v2r8(rinv22,rinvsq22,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            gmx_fjsp_decrement_3rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 315 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx01             = _fjsp_sub_v2r8(ix0,jx1);
+            dy01             = _fjsp_sub_v2r8(iy0,jy1);
+            dz01             = _fjsp_sub_v2r8(iz0,jz1);
+            dx02             = _fjsp_sub_v2r8(ix0,jx2);
+            dy02             = _fjsp_sub_v2r8(iy0,jy2);
+            dz02             = _fjsp_sub_v2r8(iz0,jz2);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+            rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+            rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+            rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq00,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq00,rinv00),crf));
+            felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq01,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq01,rinv01),crf));
+            felec            = _fjsp_mul_v2r8(qq01,_fjsp_msub_v2r8(rinv01,rinvsq01,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+            
+            fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq02,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq02,rinv02),crf));
+            felec            = _fjsp_mul_v2r8(qq02,_fjsp_msub_v2r8(rinv02,rinvsq02,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+            
+            fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq10,rinv10),crf));
+            felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq11,rinv11),crf));
+            felec            = _fjsp_mul_v2r8(qq11,_fjsp_msub_v2r8(rinv11,rinvsq11,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq12,rinv12),crf));
+            felec            = _fjsp_mul_v2r8(qq12,_fjsp_msub_v2r8(rinv12,rinvsq12,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq20,rinv20),crf));
+            felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq21,rinv21),crf));
+            felec            = _fjsp_mul_v2r8(qq21,_fjsp_msub_v2r8(rinv21,rinvsq21,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq22,rinv22),crf));
+            felec            = _fjsp_mul_v2r8(qq22,_fjsp_msub_v2r8(rinv22,rinvsq22,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            gmx_fjsp_decrement_3rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 315 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 19 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_VF,outeriter*19 + inneriter*315);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRF_VdwNone_GeomW3W3_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: ReactionField
+ * VdW interaction:            None
+ * Geometry:                   Water3-Water3
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecRF_VdwNone_GeomW3W3_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset0;
+    _fjsp_v2r8       ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    int              vdwjidx1A,vdwjidx1B;
+    _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B;
+    _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    _fjsp_v2r8       dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
+    _fjsp_v2r8       dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
+    _fjsp_v2r8       dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+    krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0);
+    crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq0              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+0]));
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+
+    jq0              = gmx_fjsp_set1_v2r8(charge[inr+0]);
+    jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+    jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+    qq00             = _fjsp_mul_v2r8(iq0,jq0);
+    qq01             = _fjsp_mul_v2r8(iq0,jq1);
+    qq02             = _fjsp_mul_v2r8(iq0,jq2);
+    qq10             = _fjsp_mul_v2r8(iq1,jq0);
+    qq11             = _fjsp_mul_v2r8(iq1,jq1);
+    qq12             = _fjsp_mul_v2r8(iq1,jq2);
+    qq20             = _fjsp_mul_v2r8(iq2,jq0);
+    qq21             = _fjsp_mul_v2r8(iq2,jq1);
+    qq22             = _fjsp_mul_v2r8(iq2,jq2);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
+        fix0             = _fjsp_setzero_v2r8();
+        fiy0             = _fjsp_setzero_v2r8();
+        fiz0             = _fjsp_setzero_v2r8();
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx01             = _fjsp_sub_v2r8(ix0,jx1);
+            dy01             = _fjsp_sub_v2r8(iy0,jy1);
+            dz01             = _fjsp_sub_v2r8(iz0,jz1);
+            dx02             = _fjsp_sub_v2r8(ix0,jx2);
+            dy02             = _fjsp_sub_v2r8(iy0,jy2);
+            dz02             = _fjsp_sub_v2r8(iz0,jz2);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+            rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+            rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+            rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq01,_fjsp_msub_v2r8(rinv01,rinvsq01,krf2));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+            
+            fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq02,_fjsp_msub_v2r8(rinv02,rinvsq02,krf2));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+            
+            fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq11,_fjsp_msub_v2r8(rinv11,rinvsq11,krf2));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq12,_fjsp_msub_v2r8(rinv12,rinvsq12,krf2));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq21,_fjsp_msub_v2r8(rinv21,rinvsq21,krf2));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq22,_fjsp_msub_v2r8(rinv22,rinvsq22,krf2));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            gmx_fjsp_decrement_3rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 270 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+
+            /* Calculate displacement vector */
+            dx00             = _fjsp_sub_v2r8(ix0,jx0);
+            dy00             = _fjsp_sub_v2r8(iy0,jy0);
+            dz00             = _fjsp_sub_v2r8(iz0,jz0);
+            dx01             = _fjsp_sub_v2r8(ix0,jx1);
+            dy01             = _fjsp_sub_v2r8(iy0,jy1);
+            dz01             = _fjsp_sub_v2r8(iz0,jz1);
+            dx02             = _fjsp_sub_v2r8(ix0,jx2);
+            dy02             = _fjsp_sub_v2r8(iy0,jy2);
+            dz02             = _fjsp_sub_v2r8(iz0,jz2);
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+
+            /* Calculate squared distance and things based on it */
+            rsq00            = gmx_fjsp_calc_rsq_v2r8(dx00,dy00,dz00);
+            rsq01            = gmx_fjsp_calc_rsq_v2r8(dx01,dy01,dz01);
+            rsq02            = gmx_fjsp_calc_rsq_v2r8(dx02,dy02,dz02);
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+
+            rinv00           = gmx_fjsp_invsqrt_v2r8(rsq00);
+            rinv01           = gmx_fjsp_invsqrt_v2r8(rsq01);
+            rinv02           = gmx_fjsp_invsqrt_v2r8(rsq02);
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+
+            rinvsq00         = _fjsp_mul_v2r8(rinv00,rinv00);
+            rinvsq01         = _fjsp_mul_v2r8(rinv01,rinv01);
+            rinvsq02         = _fjsp_mul_v2r8(rinv02,rinv02);
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq00,_fjsp_msub_v2r8(rinv00,rinvsq00,krf2));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx00,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy00,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz00,fscal,fiz0);
+            
+            fjx0             = _fjsp_madd_v2r8(dx00,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy00,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz00,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq01,_fjsp_msub_v2r8(rinv01,rinvsq01,krf2));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx01,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy01,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz01,fscal,fiz0);
+            
+            fjx1             = _fjsp_madd_v2r8(dx01,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy01,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz01,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq02,_fjsp_msub_v2r8(rinv02,rinvsq02,krf2));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix0             = _fjsp_madd_v2r8(dx02,fscal,fix0);
+            fiy0             = _fjsp_madd_v2r8(dy02,fscal,fiy0);
+            fiz0             = _fjsp_madd_v2r8(dz02,fscal,fiz0);
+            
+            fjx2             = _fjsp_madd_v2r8(dx02,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy02,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz02,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq11,_fjsp_msub_v2r8(rinv11,rinvsq11,krf2));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq12,_fjsp_msub_v2r8(rinv12,rinvsq12,krf2));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq21,_fjsp_msub_v2r8(rinv21,rinvsq21,krf2));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq22,_fjsp_msub_v2r8(rinv22,rinvsq22,krf2));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            gmx_fjsp_decrement_3rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+
+            /* Inner loop uses 270 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 18 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_F,outeriter*18 + inneriter*270);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRF_VdwNone_GeomW4P1_sparc64_hpc_ace_double.c b/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRF_VdwNone_GeomW4P1_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..89c9e8c
--- /dev/null
@@ -0,0 +1,792 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRF_VdwNone_GeomW4P1_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: ReactionField
+ * VdW interaction:            None
+ * Geometry:                   Water4-Particle
+ * Calculate force/pot:        PotentialAndForce
+ *
+ * For every i entry of the neighbor list, three sites (atoms inr+1, inr+2 and
+ * inr+3; the atom at inr itself is not touched by this kernel) interact with
+ * each listed j particle through reaction-field electrostatics only.
+ * The j list is processed two entries at a time (A and B); an unpaired last
+ * entry is handled by a separate single-entry block.
+ *
+ * Per pair the code evaluates (assuming _fjsp_madd_v2r8(a,b,c) = a*b+c and
+ * _fjsp_msub_v2r8(a,b,c) = a*b-c -- TODO confirm against the intrinsics manual):
+ *   velec = qq*(krf*rsq + rinv - crf)
+ *   felec = qq*(rinv*rinvsq - 2*krf)
+ * where qq = epsfac*q_i*q_j.
+ *
+ * Parameters:
+ *   nlist       - neighbor list; nri, iinr, jindex, jjnr, shift and gid are read.
+ *   xx          - coordinates (read).
+ *   ff          - forces (updated for both i sites and j particles).
+ *   fr          - supplies shift_vec, fshift (updated), epsfac, ic->k_rf, ic->c_rf.
+ *   mdatoms     - supplies the chargeA array.
+ *   kernel_data - energygrp_elec[gid] receives the accumulated potential.
+ *   nrnb        - flop accounting, updated through inc_nrnb().
+ *
+ * The i-site charges are read once, from the first list entry (iinr[0]), and
+ * reused for all i entries.
+ */
+void
+nb_kernel_ElecRF_VdwNone_GeomW4P1_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters. This kernel only uses i suffixes 1,2,3 and j suffix 0.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, i.e. the two
+     * different jnr indices handled by one iteration of the unrolled inner loop.
+     *
+     * Many locals declared below (e.g. rcutoff_scalar, tx, rcutoff, jidxall, vdwioffset1,
+     * isai1, vdwjidx0A, isaj0, r10, c6_10, c12_10, itab_tmp, dummy_mask, cutoff_mask, one,
+     * two, vfconv, gbconv, ewconv) are never referenced again in this function; they look
+     * like leftovers of the shared generator template.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0]; /* flat real* view of the coordinate rvec array */
+    f                = ff[0]; /* flat real* view of the force rvec array */
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+    krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0);
+    crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf);
+
+    /* Setup water-specific parameters.
+     * The charges of sites inr+1..inr+3 are taken from the FIRST i entry only and
+     * pre-multiplied by facel; the same iq1..iq3 are then reused for every i entry,
+     * so all i entries in this list are assumed to carry identical site charges.
+     * NOTE(review): iinr[0] is read even when nri is 0 -- presumably callers never
+     * pass an empty list; confirm at the call site.
+     */
+    inr              = nlist->iinr[0];
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+
+    /* Avoid compiler warnings; these are assigned again inside the loops before use */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors: j entries [j_index_start, j_index_end) */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector.
+         * The load starts at x+i_coord_offset+DIM, i.e. at the second atom of the
+         * i entry, and fills the three sites 1,2,3.
+         */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
+                                                 &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+        fix3             = _fjsp_setzero_v2r8();
+        fiy3             = _fjsp_setzero_v2r8();
+        fiz3             = _fjsp_setzero_v2r8();
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop.
+         * Two j entries (A and B) are consumed per iteration; the loop stops while
+         * at most one entry is left, which the block after the loop handles.
+         */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector (i site minus j particle) */
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx30             = _fjsp_sub_v2r8(ix3,jx0);
+            dy30             = _fjsp_sub_v2r8(iy3,jy0);
+            dz30             = _fjsp_sub_v2r8(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+
+            /* Load parameters for j particles (charges of A and B) */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             * i site 1 with j        *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq10,rinv10),crf));
+            felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force.
+             * Both the i-site and the j accumulators receive d*fscal; the j sum is
+             * handed to the decrement helper once all three sites are done.
+             */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             * i site 2 with j        *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq20,rinv20),crf));
+            felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             * i site 3 with j        *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _fjsp_mul_v2r8(iq3,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq30,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq30,rinv30),crf));
+            felec            = _fjsp_mul_v2r8(qq30,_fjsp_msub_v2r8(rinv30,rinvsq30,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+            
+            fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+
+            gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 108 flops */
+        }
+
+        if(jidx<j_index_end) /* exactly one unpaired j entry is left */
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates (single-pointer variant; only entry A exists here) */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx30             = _fjsp_sub_v2r8(ix3,jx0);
+            dy30             = _fjsp_sub_v2r8(iy3,jy0);
+            dz30             = _fjsp_sub_v2r8(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+
+            /* Load parameters for j particles.
+             * Only the charge of entry A is loaded, on top of a zero vector.
+             */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             * i site 1 with j        *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq10,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq10,rinv10),crf));
+            felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom.
+             * In this single-entry block velec (and fscal below) is first combined with a
+             * zero vector via _fjsp_unpacklo_v2r8 -- presumably to clear the lane that
+             * holds no real j particle; confirm against the intrinsic's definition.
+             */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             * i site 2 with j        *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq20,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq20,rinv20),crf));
+            felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             * i site 3 with j        *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _fjsp_mul_v2r8(iq3,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq30,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq30,rinv30),crf));
+            felec            = _fjsp_mul_v2r8(qq30,_fjsp_msub_v2r8(rinv30,rinvsq30,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+            
+            fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+
+            gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 108 flops */
+        }
+
+        /* End of innermost loop.
+         * The accumulated forces of the three i sites are handed to the helper together
+         * with the force array position of the second atom of this i entry
+         * (f+i_coord_offset+DIM) and the shift-force slot of this list entry.
+         */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset+DIM,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies: velecsum goes to energygrp_elec[ggid] */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+
+        /* Increment number of inner iterations (counted per j entry, not per SIMD pair) */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 19 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops: 19 per i entry plus 108 per j entry */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4_VF,outeriter*19 + inneriter*108);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRF_VdwNone_GeomW4P1_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: ReactionField
+ * VdW interaction:            None
+ * Geometry:                   Water4-Particle
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecRF_VdwNone_GeomW4P1_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the four positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx0A,vdwjidx0B;
+    _fjsp_v2r8       jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
+    _fjsp_v2r8       dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
+    _fjsp_v2r8       dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
+    _fjsp_v2r8       dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+    krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0);
+    crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
+                                                 &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+        fix3             = _fjsp_setzero_v2r8();
+        fiy3             = _fjsp_setzero_v2r8();
+        fiz3             = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx30             = _fjsp_sub_v2r8(ix3,jx0);
+            dy30             = _fjsp_sub_v2r8(iy3,jy0);
+            dz30             = _fjsp_sub_v2r8(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+0,charge+jnrB+0);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _fjsp_mul_v2r8(iq3,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq30,_fjsp_msub_v2r8(rinv30,rinvsq30,krf2));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+            
+            fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+
+            gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 93 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+
+            /* Calculate displacement vector */
+            dx10             = _fjsp_sub_v2r8(ix1,jx0);
+            dy10             = _fjsp_sub_v2r8(iy1,jy0);
+            dz10             = _fjsp_sub_v2r8(iz1,jz0);
+            dx20             = _fjsp_sub_v2r8(ix2,jx0);
+            dy20             = _fjsp_sub_v2r8(iy2,jy0);
+            dz20             = _fjsp_sub_v2r8(iz2,jz0);
+            dx30             = _fjsp_sub_v2r8(ix3,jx0);
+            dy30             = _fjsp_sub_v2r8(iy3,jy0);
+            dz30             = _fjsp_sub_v2r8(iz3,jz0);
+
+            /* Calculate squared distance and things based on it */
+            rsq10            = gmx_fjsp_calc_rsq_v2r8(dx10,dy10,dz10);
+            rsq20            = gmx_fjsp_calc_rsq_v2r8(dx20,dy20,dz20);
+            rsq30            = gmx_fjsp_calc_rsq_v2r8(dx30,dy30,dz30);
+
+            rinv10           = gmx_fjsp_invsqrt_v2r8(rsq10);
+            rinv20           = gmx_fjsp_invsqrt_v2r8(rsq20);
+            rinv30           = gmx_fjsp_invsqrt_v2r8(rsq30);
+
+            rinvsq10         = _fjsp_mul_v2r8(rinv10,rinv10);
+            rinvsq20         = _fjsp_mul_v2r8(rinv20,rinv20);
+            rinvsq30         = _fjsp_mul_v2r8(rinv30,rinv30);
+
+            /* Load parameters for j particles */
+            jq0              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+0);
+
+            fjx0             = _fjsp_setzero_v2r8();
+            fjy0             = _fjsp_setzero_v2r8();
+            fjz0             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq10             = _fjsp_mul_v2r8(iq1,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq10,_fjsp_msub_v2r8(rinv10,rinvsq10,krf2));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx10,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy10,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz10,fscal,fiz1);
+            
+            fjx0             = _fjsp_madd_v2r8(dx10,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy10,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz10,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq20             = _fjsp_mul_v2r8(iq2,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq20,_fjsp_msub_v2r8(rinv20,rinvsq20,krf2));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx20,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy20,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz20,fscal,fiz2);
+            
+            fjx0             = _fjsp_madd_v2r8(dx20,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy20,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz20,fscal,fjz0);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* Compute parameters for interactions between i and j atoms */
+            qq30             = _fjsp_mul_v2r8(iq3,jq0);
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq30,_fjsp_msub_v2r8(rinv30,rinvsq30,krf2));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx30,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy30,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz30,fscal,fiz3);
+            
+            fjx0             = _fjsp_madd_v2r8(dx30,fscal,fjx0);
+            fjy0             = _fjsp_madd_v2r8(dy30,fscal,fjy0);
+            fjz0             = _fjsp_madd_v2r8(dz30,fscal,fjz0);
+
+            gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+
+            /* Inner loop uses 93 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset+DIM,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 18 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4_F,outeriter*18 + inneriter*93);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRF_VdwNone_GeomW4W4_sparc64_hpc_ace_double.c b/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_ElecRF_VdwNone_GeomW4W4_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..b6b4e44
--- /dev/null
@@ -0,0 +1,1468 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRF_VdwNone_GeomW4W4_VF_sparc64_hpc_ace_double
+ * Electrostatics interaction: ReactionField
+ * VdW interaction:            None
+ * Geometry:                   Water4-Water4
+ * Calculate force/pot:        PotentialAndForce
+ */
+void
+nb_kernel_ElecRF_VdwNone_GeomW4W4_VF_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx1A,vdwjidx1B;
+    _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B;
+    _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    int              vdwjidx3A,vdwjidx3B;
+    _fjsp_v2r8       jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+    _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    _fjsp_v2r8       dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+    _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    _fjsp_v2r8       dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+    _fjsp_v2r8       dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+    _fjsp_v2r8       dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+    _fjsp_v2r8       dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+    krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0);
+    crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+
+    jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+    jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+    jq3              = gmx_fjsp_set1_v2r8(charge[inr+3]);
+    qq11             = _fjsp_mul_v2r8(iq1,jq1);
+    qq12             = _fjsp_mul_v2r8(iq1,jq2);
+    qq13             = _fjsp_mul_v2r8(iq1,jq3);
+    qq21             = _fjsp_mul_v2r8(iq2,jq1);
+    qq22             = _fjsp_mul_v2r8(iq2,jq2);
+    qq23             = _fjsp_mul_v2r8(iq2,jq3);
+    qq31             = _fjsp_mul_v2r8(iq3,jq1);
+    qq32             = _fjsp_mul_v2r8(iq3,jq2);
+    qq33             = _fjsp_mul_v2r8(iq3,jq3);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
+                                                 &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+        fix3             = _fjsp_setzero_v2r8();
+        fiy3             = _fjsp_setzero_v2r8();
+        fiz3             = _fjsp_setzero_v2r8();
+
+        /* Reset potential sums */
+        velecsum         = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
+                                              &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx13             = _fjsp_sub_v2r8(ix1,jx3);
+            dy13             = _fjsp_sub_v2r8(iy1,jy3);
+            dz13             = _fjsp_sub_v2r8(iz1,jz3);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+            dx23             = _fjsp_sub_v2r8(ix2,jx3);
+            dy23             = _fjsp_sub_v2r8(iy2,jy3);
+            dz23             = _fjsp_sub_v2r8(iz2,jz3);
+            dx31             = _fjsp_sub_v2r8(ix3,jx1);
+            dy31             = _fjsp_sub_v2r8(iy3,jy1);
+            dz31             = _fjsp_sub_v2r8(iz3,jz1);
+            dx32             = _fjsp_sub_v2r8(ix3,jx2);
+            dy32             = _fjsp_sub_v2r8(iy3,jy2);
+            dz32             = _fjsp_sub_v2r8(iz3,jz2);
+            dx33             = _fjsp_sub_v2r8(ix3,jx3);
+            dy33             = _fjsp_sub_v2r8(iy3,jy3);
+            dz33             = _fjsp_sub_v2r8(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+            rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+            rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+            rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+            rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+            rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+            rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+            rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+            rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+            rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+            rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+            rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+            rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+            fjx3             = _fjsp_setzero_v2r8();
+            fjy3             = _fjsp_setzero_v2r8();
+            fjz3             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq11,rinv11),crf));
+            felec            = _fjsp_mul_v2r8(qq11,_fjsp_msub_v2r8(rinv11,rinvsq11,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq12,rinv12),crf));
+            felec            = _fjsp_mul_v2r8(qq12,_fjsp_msub_v2r8(rinv12,rinvsq12,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq13,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq13,rinv13),crf));
+            felec            = _fjsp_mul_v2r8(qq13,_fjsp_msub_v2r8(rinv13,rinvsq13,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+            
+            fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq21,rinv21),crf));
+            felec            = _fjsp_mul_v2r8(qq21,_fjsp_msub_v2r8(rinv21,rinvsq21,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq22,rinv22),crf));
+            felec            = _fjsp_mul_v2r8(qq22,_fjsp_msub_v2r8(rinv22,rinvsq22,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq23,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq23,rinv23),crf));
+            felec            = _fjsp_mul_v2r8(qq23,_fjsp_msub_v2r8(rinv23,rinvsq23,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+            
+            fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq31,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq31,rinv31),crf));
+            felec            = _fjsp_mul_v2r8(qq31,_fjsp_msub_v2r8(rinv31,rinvsq31,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+            
+            fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq32,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq32,rinv32),crf));
+            felec            = _fjsp_mul_v2r8(qq32,_fjsp_msub_v2r8(rinv32,rinvsq32,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+            
+            fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq33,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq33,rinv33),crf));
+            felec            = _fjsp_mul_v2r8(qq33,_fjsp_msub_v2r8(rinv33,rinvsq33,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+            
+            fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+
+            gmx_fjsp_decrement_3rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA+DIM,f+j_coord_offsetB+DIM,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 315 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA+DIM,
+                                              &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx13             = _fjsp_sub_v2r8(ix1,jx3);
+            dy13             = _fjsp_sub_v2r8(iy1,jy3);
+            dz13             = _fjsp_sub_v2r8(iz1,jz3);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+            dx23             = _fjsp_sub_v2r8(ix2,jx3);
+            dy23             = _fjsp_sub_v2r8(iy2,jy3);
+            dz23             = _fjsp_sub_v2r8(iz2,jz3);
+            dx31             = _fjsp_sub_v2r8(ix3,jx1);
+            dy31             = _fjsp_sub_v2r8(iy3,jy1);
+            dz31             = _fjsp_sub_v2r8(iz3,jz1);
+            dx32             = _fjsp_sub_v2r8(ix3,jx2);
+            dy32             = _fjsp_sub_v2r8(iy3,jy2);
+            dz32             = _fjsp_sub_v2r8(iz3,jz2);
+            dx33             = _fjsp_sub_v2r8(ix3,jx3);
+            dy33             = _fjsp_sub_v2r8(iy3,jy3);
+            dz33             = _fjsp_sub_v2r8(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+            rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+            rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+            rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+            rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+            rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+            rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+            rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+            rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+            rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+            rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+            rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+            rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+            fjx3             = _fjsp_setzero_v2r8();
+            fjy3             = _fjsp_setzero_v2r8();
+            fjz3             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq11,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq11,rinv11),crf));
+            felec            = _fjsp_mul_v2r8(qq11,_fjsp_msub_v2r8(rinv11,rinvsq11,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq12,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq12,rinv12),crf));
+            felec            = _fjsp_mul_v2r8(qq12,_fjsp_msub_v2r8(rinv12,rinvsq12,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq13,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq13,rinv13),crf));
+            felec            = _fjsp_mul_v2r8(qq13,_fjsp_msub_v2r8(rinv13,rinvsq13,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+            
+            fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq21,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq21,rinv21),crf));
+            felec            = _fjsp_mul_v2r8(qq21,_fjsp_msub_v2r8(rinv21,rinvsq21,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq22,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq22,rinv22),crf));
+            felec            = _fjsp_mul_v2r8(qq22,_fjsp_msub_v2r8(rinv22,rinvsq22,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq23,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq23,rinv23),crf));
+            felec            = _fjsp_mul_v2r8(qq23,_fjsp_msub_v2r8(rinv23,rinvsq23,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+            
+            fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq31,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq31,rinv31),crf));
+            felec            = _fjsp_mul_v2r8(qq31,_fjsp_msub_v2r8(rinv31,rinvsq31,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+            
+            fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq32,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq32,rinv32),crf));
+            felec            = _fjsp_mul_v2r8(qq32,_fjsp_msub_v2r8(rinv32,rinvsq32,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+            
+            fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq33,_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq33,rinv33),crf));
+            felec            = _fjsp_mul_v2r8(qq33,_fjsp_msub_v2r8(rinv33,rinvsq33,krf2));
+
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+            
+            fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+
+            gmx_fjsp_decrement_3rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA+DIM,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 315 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset+DIM,fshift+i_shift_offset);
+
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 19 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_VF,outeriter*19 + inneriter*315);
+}
+/*
+ * Gromacs nonbonded kernel:   nb_kernel_ElecRF_VdwNone_GeomW4W4_F_sparc64_hpc_ace_double
+ * Electrostatics interaction: ReactionField
+ * VdW interaction:            None
+ * Geometry:                   Water4-Water4
+ * Calculate force/pot:        Force
+ */
+void
+nb_kernel_ElecRF_VdwNone_GeomW4W4_F_sparc64_hpc_ace_double
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    int              vdwioffset1;
+    _fjsp_v2r8       ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
+    int              vdwioffset2;
+    _fjsp_v2r8       ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
+    int              vdwioffset3;
+    _fjsp_v2r8       ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
+    int              vdwjidx1A,vdwjidx1B;
+    _fjsp_v2r8       jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
+    int              vdwjidx2A,vdwjidx2B;
+    _fjsp_v2r8       jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
+    int              vdwjidx3A,vdwjidx3B;
+    _fjsp_v2r8       jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
+    _fjsp_v2r8       dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
+    _fjsp_v2r8       dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
+    _fjsp_v2r8       dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
+    _fjsp_v2r8       dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
+    _fjsp_v2r8       dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
+    _fjsp_v2r8       dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
+    _fjsp_v2r8       dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
+    _fjsp_v2r8       dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
+    _fjsp_v2r8       dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+    krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0);
+    crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf);
+
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    iq1              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+1]));
+    iq2              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+2]));
+    iq3              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+3]));
+
+    jq1              = gmx_fjsp_set1_v2r8(charge[inr+1]);
+    jq2              = gmx_fjsp_set1_v2r8(charge[inr+2]);
+    jq3              = gmx_fjsp_set1_v2r8(charge[inr+3]);
+    qq11             = _fjsp_mul_v2r8(iq1,jq1);
+    qq12             = _fjsp_mul_v2r8(iq1,jq2);
+    qq13             = _fjsp_mul_v2r8(iq1,jq3);
+    qq21             = _fjsp_mul_v2r8(iq2,jq1);
+    qq22             = _fjsp_mul_v2r8(iq2,jq2);
+    qq23             = _fjsp_mul_v2r8(iq2,jq3);
+    qq31             = _fjsp_mul_v2r8(iq3,jq1);
+    qq32             = _fjsp_mul_v2r8(iq3,jq2);
+    qq33             = _fjsp_mul_v2r8(iq3,jq3);
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
+                                                 &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
+        fix1             = _fjsp_setzero_v2r8();
+        fiy1             = _fjsp_setzero_v2r8();
+        fiz1             = _fjsp_setzero_v2r8();
+        fix2             = _fjsp_setzero_v2r8();
+        fiy2             = _fjsp_setzero_v2r8();
+        fiz2             = _fjsp_setzero_v2r8();
+        fix3             = _fjsp_setzero_v2r8();
+        fiy3             = _fjsp_setzero_v2r8();
+        fiz3             = _fjsp_setzero_v2r8();
+
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
+                                              &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx13             = _fjsp_sub_v2r8(ix1,jx3);
+            dy13             = _fjsp_sub_v2r8(iy1,jy3);
+            dz13             = _fjsp_sub_v2r8(iz1,jz3);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+            dx23             = _fjsp_sub_v2r8(ix2,jx3);
+            dy23             = _fjsp_sub_v2r8(iy2,jy3);
+            dz23             = _fjsp_sub_v2r8(iz2,jz3);
+            dx31             = _fjsp_sub_v2r8(ix3,jx1);
+            dy31             = _fjsp_sub_v2r8(iy3,jy1);
+            dz31             = _fjsp_sub_v2r8(iz3,jz1);
+            dx32             = _fjsp_sub_v2r8(ix3,jx2);
+            dy32             = _fjsp_sub_v2r8(iy3,jy2);
+            dz32             = _fjsp_sub_v2r8(iz3,jz2);
+            dx33             = _fjsp_sub_v2r8(ix3,jx3);
+            dy33             = _fjsp_sub_v2r8(iy3,jy3);
+            dz33             = _fjsp_sub_v2r8(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+            rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+            rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+            rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+            rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+            rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+            rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+            rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+            rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+            rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+            rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+            rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+            rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+            fjx3             = _fjsp_setzero_v2r8();
+            fjy3             = _fjsp_setzero_v2r8();
+            fjz3             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq11,_fjsp_msub_v2r8(rinv11,rinvsq11,krf2));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq12,_fjsp_msub_v2r8(rinv12,rinvsq12,krf2));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq13,_fjsp_msub_v2r8(rinv13,rinvsq13,krf2));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+            
+            fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq21,_fjsp_msub_v2r8(rinv21,rinvsq21,krf2));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq22,_fjsp_msub_v2r8(rinv22,rinvsq22,krf2));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq23,_fjsp_msub_v2r8(rinv23,rinvsq23,krf2));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+            
+            fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq31,_fjsp_msub_v2r8(rinv31,rinvsq31,krf2));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+            
+            fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq32,_fjsp_msub_v2r8(rinv32,rinvsq32,krf2));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+            
+            fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq33,_fjsp_msub_v2r8(rinv33,rinvsq33,krf2));
+
+            fscal            = felec;
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+            
+            fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+
+            gmx_fjsp_decrement_3rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA+DIM,f+j_coord_offsetB+DIM,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 270 flops */
+        }
+
+        if(jidx<j_index_end)
+        {
+
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            gmx_fjsp_load_3rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA+DIM,
+                                              &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
+
+            /* Calculate displacement vector */
+            dx11             = _fjsp_sub_v2r8(ix1,jx1);
+            dy11             = _fjsp_sub_v2r8(iy1,jy1);
+            dz11             = _fjsp_sub_v2r8(iz1,jz1);
+            dx12             = _fjsp_sub_v2r8(ix1,jx2);
+            dy12             = _fjsp_sub_v2r8(iy1,jy2);
+            dz12             = _fjsp_sub_v2r8(iz1,jz2);
+            dx13             = _fjsp_sub_v2r8(ix1,jx3);
+            dy13             = _fjsp_sub_v2r8(iy1,jy3);
+            dz13             = _fjsp_sub_v2r8(iz1,jz3);
+            dx21             = _fjsp_sub_v2r8(ix2,jx1);
+            dy21             = _fjsp_sub_v2r8(iy2,jy1);
+            dz21             = _fjsp_sub_v2r8(iz2,jz1);
+            dx22             = _fjsp_sub_v2r8(ix2,jx2);
+            dy22             = _fjsp_sub_v2r8(iy2,jy2);
+            dz22             = _fjsp_sub_v2r8(iz2,jz2);
+            dx23             = _fjsp_sub_v2r8(ix2,jx3);
+            dy23             = _fjsp_sub_v2r8(iy2,jy3);
+            dz23             = _fjsp_sub_v2r8(iz2,jz3);
+            dx31             = _fjsp_sub_v2r8(ix3,jx1);
+            dy31             = _fjsp_sub_v2r8(iy3,jy1);
+            dz31             = _fjsp_sub_v2r8(iz3,jz1);
+            dx32             = _fjsp_sub_v2r8(ix3,jx2);
+            dy32             = _fjsp_sub_v2r8(iy3,jy2);
+            dz32             = _fjsp_sub_v2r8(iz3,jz2);
+            dx33             = _fjsp_sub_v2r8(ix3,jx3);
+            dy33             = _fjsp_sub_v2r8(iy3,jy3);
+            dz33             = _fjsp_sub_v2r8(iz3,jz3);
+
+            /* Calculate squared distance and things based on it */
+            rsq11            = gmx_fjsp_calc_rsq_v2r8(dx11,dy11,dz11);
+            rsq12            = gmx_fjsp_calc_rsq_v2r8(dx12,dy12,dz12);
+            rsq13            = gmx_fjsp_calc_rsq_v2r8(dx13,dy13,dz13);
+            rsq21            = gmx_fjsp_calc_rsq_v2r8(dx21,dy21,dz21);
+            rsq22            = gmx_fjsp_calc_rsq_v2r8(dx22,dy22,dz22);
+            rsq23            = gmx_fjsp_calc_rsq_v2r8(dx23,dy23,dz23);
+            rsq31            = gmx_fjsp_calc_rsq_v2r8(dx31,dy31,dz31);
+            rsq32            = gmx_fjsp_calc_rsq_v2r8(dx32,dy32,dz32);
+            rsq33            = gmx_fjsp_calc_rsq_v2r8(dx33,dy33,dz33);
+
+            rinv11           = gmx_fjsp_invsqrt_v2r8(rsq11);
+            rinv12           = gmx_fjsp_invsqrt_v2r8(rsq12);
+            rinv13           = gmx_fjsp_invsqrt_v2r8(rsq13);
+            rinv21           = gmx_fjsp_invsqrt_v2r8(rsq21);
+            rinv22           = gmx_fjsp_invsqrt_v2r8(rsq22);
+            rinv23           = gmx_fjsp_invsqrt_v2r8(rsq23);
+            rinv31           = gmx_fjsp_invsqrt_v2r8(rsq31);
+            rinv32           = gmx_fjsp_invsqrt_v2r8(rsq32);
+            rinv33           = gmx_fjsp_invsqrt_v2r8(rsq33);
+
+            rinvsq11         = _fjsp_mul_v2r8(rinv11,rinv11);
+            rinvsq12         = _fjsp_mul_v2r8(rinv12,rinv12);
+            rinvsq13         = _fjsp_mul_v2r8(rinv13,rinv13);
+            rinvsq21         = _fjsp_mul_v2r8(rinv21,rinv21);
+            rinvsq22         = _fjsp_mul_v2r8(rinv22,rinv22);
+            rinvsq23         = _fjsp_mul_v2r8(rinv23,rinv23);
+            rinvsq31         = _fjsp_mul_v2r8(rinv31,rinv31);
+            rinvsq32         = _fjsp_mul_v2r8(rinv32,rinv32);
+            rinvsq33         = _fjsp_mul_v2r8(rinv33,rinv33);
+
+            fjx1             = _fjsp_setzero_v2r8();
+            fjy1             = _fjsp_setzero_v2r8();
+            fjz1             = _fjsp_setzero_v2r8();
+            fjx2             = _fjsp_setzero_v2r8();
+            fjy2             = _fjsp_setzero_v2r8();
+            fjz2             = _fjsp_setzero_v2r8();
+            fjx3             = _fjsp_setzero_v2r8();
+            fjy3             = _fjsp_setzero_v2r8();
+            fjz3             = _fjsp_setzero_v2r8();
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq11,_fjsp_msub_v2r8(rinv11,rinvsq11,krf2));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx11,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy11,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz11,fscal,fiz1);
+            
+            fjx1             = _fjsp_madd_v2r8(dx11,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy11,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz11,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq12,_fjsp_msub_v2r8(rinv12,rinvsq12,krf2));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx12,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy12,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz12,fscal,fiz1);
+            
+            fjx2             = _fjsp_madd_v2r8(dx12,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy12,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz12,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq13,_fjsp_msub_v2r8(rinv13,rinvsq13,krf2));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix1             = _fjsp_madd_v2r8(dx13,fscal,fix1);
+            fiy1             = _fjsp_madd_v2r8(dy13,fscal,fiy1);
+            fiz1             = _fjsp_madd_v2r8(dz13,fscal,fiz1);
+            
+            fjx3             = _fjsp_madd_v2r8(dx13,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy13,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz13,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq21,_fjsp_msub_v2r8(rinv21,rinvsq21,krf2));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx21,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy21,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz21,fscal,fiz2);
+            
+            fjx1             = _fjsp_madd_v2r8(dx21,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy21,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz21,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq22,_fjsp_msub_v2r8(rinv22,rinvsq22,krf2));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx22,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy22,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz22,fscal,fiz2);
+            
+            fjx2             = _fjsp_madd_v2r8(dx22,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy22,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz22,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq23,_fjsp_msub_v2r8(rinv23,rinvsq23,krf2));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix2             = _fjsp_madd_v2r8(dx23,fscal,fix2);
+            fiy2             = _fjsp_madd_v2r8(dy23,fscal,fiy2);
+            fiz2             = _fjsp_madd_v2r8(dz23,fscal,fiz2);
+            
+            fjx3             = _fjsp_madd_v2r8(dx23,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy23,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz23,fscal,fjz3);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq31,_fjsp_msub_v2r8(rinv31,rinvsq31,krf2));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx31,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy31,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz31,fscal,fiz3);
+            
+            fjx1             = _fjsp_madd_v2r8(dx31,fscal,fjx1);
+            fjy1             = _fjsp_madd_v2r8(dy31,fscal,fjy1);
+            fjz1             = _fjsp_madd_v2r8(dz31,fscal,fjz1);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq32,_fjsp_msub_v2r8(rinv32,rinvsq32,krf2));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx32,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy32,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz32,fscal,fiz3);
+            
+            fjx2             = _fjsp_madd_v2r8(dx32,fscal,fjx2);
+            fjy2             = _fjsp_madd_v2r8(dy32,fscal,fjy2);
+            fjz2             = _fjsp_madd_v2r8(dz32,fscal,fjz2);
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            felec            = _fjsp_mul_v2r8(qq33,_fjsp_msub_v2r8(rinv33,rinvsq33,krf2));
+
+            fscal            = felec;
+
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+
+            /* Update vectorial force */
+            fix3             = _fjsp_madd_v2r8(dx33,fscal,fix3);
+            fiy3             = _fjsp_madd_v2r8(dy33,fscal,fiy3);
+            fiz3             = _fjsp_madd_v2r8(dz33,fscal,fiz3);
+            
+            fjx3             = _fjsp_madd_v2r8(dx33,fscal,fjx3);
+            fjy3             = _fjsp_madd_v2r8(dy33,fscal,fjy3);
+            fjz3             = _fjsp_madd_v2r8(dz33,fscal,fjz3);
+
+            gmx_fjsp_decrement_3rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA+DIM,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+
+            /* Inner loop uses 270 flops */
+        }
+
+        /* End of innermost loop */
+
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset+DIM,fshift+i_shift_offset);
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses 18 flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_F,outeriter*18 + inneriter*270);
+}
diff --git a/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_sparc64_hpc_ace_double.c b/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_sparc64_hpc_ace_double.c
new file mode 100644 (file)
index 0000000..5e07e4b
--- /dev/null
@@ -0,0 +1,481 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*
+ * Note: this file was generated by the GROMACS sparc64_hpc_ace_double kernel generator.
+ */
+#ifndef nb_kernel_sparc64_hpc_ace_double_h
+#define nb_kernel_sparc64_hpc_ace_double_h
+
+#include "../nb_kernel.h"
+/*
+ * Declarations of every generated sparc64_hpc_ace_double nonbonded kernel.
+ * nb_kernel_t is not defined in this file; presumably it comes from
+ * ../nb_kernel.h included above (TODO confirm).
+ *
+ * Names follow the pattern
+ *   nb_kernel_Elec<E>_Vdw<V>_Geom<G>_<VF|F>_sparc64_hpc_ace_double
+ * Judging from the kernellist_sparc64_hpc_ace_double entries further down:
+ *   VF = "PotentialAndForce", F = "Force";
+ *   P1P1 = "ParticleParticle", W3P1 = "Water3Particle";
+ *   Ew = "Ewald", LJ = "LennardJones", CSTab = "CubicSplineTable";
+ *   an Sh / Sw suffix = "PotentialShift" / "PotentialSwitch" modifier.
+ * Each symbol declared here is expected to be the first member of one
+ * kernellist entry (verify against the full table).
+ */
+nb_kernel_t nb_kernel_ElecNone_VdwLJ_GeomP1P1_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecNone_VdwLJ_GeomP1P1_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecNone_VdwLJSh_GeomP1P1_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecNone_VdwLJSh_GeomP1P1_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecNone_VdwLJSw_GeomP1P1_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecNone_VdwLJSw_GeomP1P1_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecNone_VdwCSTab_GeomP1P1_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecNone_VdwCSTab_GeomP1P1_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecEw_VdwLJ_GeomP1P1_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecEw_VdwLJ_GeomP1P1_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecEw_VdwLJ_GeomW3P1_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecEw_VdwLJ_GeomW3P1_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecEw_VdwLJ_GeomW3W3_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecEw_VdwLJ_GeomW3W3_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecEw_VdwLJ_GeomW4P1_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecEw_VdwLJ_GeomW4P1_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecEw_VdwLJ_GeomW4W4_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecEw_VdwLJ_GeomW4W4_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecEw_VdwNone_GeomP1P1_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecEw_VdwNone_GeomP1P1_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecEw_VdwNone_GeomW3P1_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecEw_VdwNone_GeomW3P1_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecEw_VdwNone_GeomW3W3_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecEw_VdwNone_GeomW3W3_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecEw_VdwNone_GeomW4P1_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecEw_VdwNone_GeomW4P1_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecEw_VdwNone_GeomW4W4_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecEw_VdwNone_GeomW4W4_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecEw_VdwCSTab_GeomP1P1_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecEw_VdwCSTab_GeomP1P1_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecEw_VdwCSTab_GeomW3P1_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecEw_VdwCSTab_GeomW3P1_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecEw_VdwCSTab_GeomW3W3_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecEw_VdwCSTab_GeomW3W3_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecEw_VdwCSTab_GeomW4P1_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecEw_VdwCSTab_GeomW4P1_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecEw_VdwCSTab_GeomW4W4_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecEw_VdwCSTab_GeomW4W4_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecEwSh_VdwLJSh_GeomP1P1_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecEwSh_VdwLJSh_GeomP1P1_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecEwSh_VdwLJSh_GeomW4W4_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecEwSh_VdwLJSh_GeomW4W4_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecEwSh_VdwNone_GeomP1P1_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecEwSh_VdwNone_GeomP1P1_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecEwSh_VdwNone_GeomW3P1_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecEwSh_VdwNone_GeomW3P1_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecEwSh_VdwNone_GeomW3W3_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecEwSh_VdwNone_GeomW3W3_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecEwSh_VdwNone_GeomW4P1_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecEwSh_VdwNone_GeomW4P1_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecEwSh_VdwNone_GeomW4W4_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecEwSh_VdwNone_GeomW4W4_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecEwSw_VdwLJSw_GeomP1P1_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecEwSw_VdwLJSw_GeomP1P1_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecEwSw_VdwLJSw_GeomW3P1_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecEwSw_VdwLJSw_GeomW3P1_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecEwSw_VdwNone_GeomP1P1_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecEwSw_VdwNone_GeomP1P1_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecEwSw_VdwNone_GeomW3P1_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecEwSw_VdwNone_GeomW3P1_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecEwSw_VdwNone_GeomW3W3_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecEwSw_VdwNone_GeomW3W3_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecEwSw_VdwNone_GeomW4P1_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecEwSw_VdwNone_GeomW4P1_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecEwSw_VdwNone_GeomW4W4_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecEwSw_VdwNone_GeomW4W4_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecCoul_VdwLJ_GeomP1P1_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecCoul_VdwLJ_GeomP1P1_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecCoul_VdwLJ_GeomW3P1_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecCoul_VdwLJ_GeomW3P1_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecCoul_VdwLJ_GeomW3W3_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecCoul_VdwLJ_GeomW3W3_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecCoul_VdwLJ_GeomW4P1_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecCoul_VdwLJ_GeomW4P1_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecCoul_VdwLJ_GeomW4W4_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecCoul_VdwLJ_GeomW4W4_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecCoul_VdwNone_GeomP1P1_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecCoul_VdwNone_GeomP1P1_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecCoul_VdwNone_GeomW3P1_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecCoul_VdwNone_GeomW3P1_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecCoul_VdwNone_GeomW3W3_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecCoul_VdwNone_GeomW3W3_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecCoul_VdwNone_GeomW4P1_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecCoul_VdwNone_GeomW4P1_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecCoul_VdwNone_GeomW4W4_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecCoul_VdwNone_GeomW4W4_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecCoul_VdwCSTab_GeomP1P1_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecCoul_VdwCSTab_GeomP1P1_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecCoul_VdwCSTab_GeomW3P1_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecCoul_VdwCSTab_GeomW3P1_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecCoul_VdwCSTab_GeomW3W3_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecCoul_VdwCSTab_GeomW3W3_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecCoul_VdwCSTab_GeomW4W4_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecCoul_VdwCSTab_GeomW4W4_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecCSTab_VdwLJ_GeomP1P1_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecCSTab_VdwLJ_GeomP1P1_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecCSTab_VdwLJ_GeomW3P1_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecCSTab_VdwLJ_GeomW3P1_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecCSTab_VdwLJ_GeomW4W4_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecCSTab_VdwLJ_GeomW4W4_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecCSTab_VdwNone_GeomP1P1_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecCSTab_VdwNone_GeomP1P1_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecCSTab_VdwNone_GeomW3P1_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecCSTab_VdwNone_GeomW3P1_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecCSTab_VdwNone_GeomW3W3_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecCSTab_VdwNone_GeomW3W3_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecCSTab_VdwNone_GeomW4P1_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecCSTab_VdwNone_GeomW4P1_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecCSTab_VdwNone_GeomW4W4_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecCSTab_VdwNone_GeomW4W4_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecCSTab_VdwCSTab_GeomP1P1_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecCSTab_VdwCSTab_GeomP1P1_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecCSTab_VdwCSTab_GeomW3P1_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecCSTab_VdwCSTab_GeomW3P1_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecGB_VdwLJ_GeomP1P1_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecGB_VdwLJ_GeomP1P1_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecGB_VdwNone_GeomP1P1_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecGB_VdwNone_GeomP1P1_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecGB_VdwCSTab_GeomP1P1_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecGB_VdwCSTab_GeomP1P1_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecRFCut_VdwLJSh_GeomP1P1_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecRFCut_VdwLJSh_GeomP1P1_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecRFCut_VdwLJSh_GeomW3P1_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecRFCut_VdwLJSh_GeomW3P1_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecRFCut_VdwLJSh_GeomW3W3_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecRFCut_VdwLJSh_GeomW3W3_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecRFCut_VdwLJSh_GeomW4W4_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecRFCut_VdwLJSh_GeomW4W4_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecRFCut_VdwLJSw_GeomP1P1_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecRFCut_VdwLJSw_GeomP1P1_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecRFCut_VdwLJSw_GeomW3P1_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecRFCut_VdwLJSw_GeomW3P1_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecRFCut_VdwLJSw_GeomW3W3_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecRFCut_VdwLJSw_GeomW3W3_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecRFCut_VdwLJSw_GeomW4W4_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecRFCut_VdwLJSw_GeomW4W4_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecRFCut_VdwNone_GeomP1P1_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecRFCut_VdwNone_GeomP1P1_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecRFCut_VdwNone_GeomW3P1_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecRFCut_VdwNone_GeomW3P1_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecRFCut_VdwNone_GeomW3W3_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecRFCut_VdwNone_GeomW3W3_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecRFCut_VdwNone_GeomW4P1_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecRFCut_VdwNone_GeomW4P1_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecRFCut_VdwNone_GeomW4W4_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecRFCut_VdwNone_GeomW4W4_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecRFCut_VdwCSTab_GeomP1P1_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecRFCut_VdwCSTab_GeomP1P1_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecRFCut_VdwCSTab_GeomW3P1_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecRFCut_VdwCSTab_GeomW3P1_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecRFCut_VdwCSTab_GeomW3W3_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecRFCut_VdwCSTab_GeomW3W3_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecRFCut_VdwCSTab_GeomW4P1_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecRFCut_VdwCSTab_GeomW4P1_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecRF_VdwLJ_GeomP1P1_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecRF_VdwLJ_GeomP1P1_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecRF_VdwLJ_GeomW3P1_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecRF_VdwLJ_GeomW3P1_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecRF_VdwLJ_GeomW3W3_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecRF_VdwLJ_GeomW3W3_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecRF_VdwLJ_GeomW4P1_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecRF_VdwLJ_GeomW4P1_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecRF_VdwLJ_GeomW4W4_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecRF_VdwLJ_GeomW4W4_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecRF_VdwNone_GeomP1P1_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecRF_VdwNone_GeomP1P1_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecRF_VdwNone_GeomW3P1_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecRF_VdwNone_GeomW3P1_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecRF_VdwNone_GeomW3W3_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecRF_VdwNone_GeomW3W3_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecRF_VdwNone_GeomW4P1_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecRF_VdwNone_GeomW4P1_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecRF_VdwNone_GeomW4W4_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecRF_VdwNone_GeomW4W4_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecRF_VdwCSTab_GeomP1P1_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecRF_VdwCSTab_GeomP1P1_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecRF_VdwCSTab_GeomW3P1_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecRF_VdwCSTab_GeomW3P1_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecRF_VdwCSTab_GeomW3W3_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecRF_VdwCSTab_GeomW3W3_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecRF_VdwCSTab_GeomW4P1_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecRF_VdwCSTab_GeomW4P1_F_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecRF_VdwCSTab_GeomW4W4_VF_sparc64_hpc_ace_double;
+nb_kernel_t nb_kernel_ElecRF_VdwCSTab_GeomW4W4_F_sparc64_hpc_ace_double;
+
+
+nb_kernel_info_t
+kernellist_sparc64_hpc_ace_double[] =
+{
+    { nb_kernel_ElecNone_VdwLJ_GeomP1P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecNone_VdwLJ_GeomP1P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "None", "None", "LennardJones", "None", "ParticleParticle", "", "PotentialAndForce" },
+    { nb_kernel_ElecNone_VdwLJ_GeomP1P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecNone_VdwLJ_GeomP1P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "None", "None", "LennardJones", "None", "ParticleParticle", "", "Force" },
+    { nb_kernel_ElecNone_VdwLJSh_GeomP1P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecNone_VdwLJSh_GeomP1P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "None", "None", "LennardJones", "PotentialShift", "ParticleParticle", "", "PotentialAndForce" },
+    { nb_kernel_ElecNone_VdwLJSh_GeomP1P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecNone_VdwLJSh_GeomP1P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "None", "None", "LennardJones", "PotentialShift", "ParticleParticle", "", "Force" },
+    { nb_kernel_ElecNone_VdwLJSw_GeomP1P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecNone_VdwLJSw_GeomP1P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "None", "None", "LennardJones", "PotentialSwitch", "ParticleParticle", "", "PotentialAndForce" },
+    { nb_kernel_ElecNone_VdwLJSw_GeomP1P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecNone_VdwLJSw_GeomP1P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "None", "None", "LennardJones", "PotentialSwitch", "ParticleParticle", "", "Force" },
+    { nb_kernel_ElecNone_VdwCSTab_GeomP1P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecNone_VdwCSTab_GeomP1P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "None", "None", "CubicSplineTable", "None", "ParticleParticle", "", "PotentialAndForce" },
+    { nb_kernel_ElecNone_VdwCSTab_GeomP1P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecNone_VdwCSTab_GeomP1P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "None", "None", "CubicSplineTable", "None", "ParticleParticle", "", "Force" },
+    { nb_kernel_ElecEw_VdwLJ_GeomP1P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecEw_VdwLJ_GeomP1P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "None", "LennardJones", "None", "ParticleParticle", "", "PotentialAndForce" },
+    { nb_kernel_ElecEw_VdwLJ_GeomP1P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecEw_VdwLJ_GeomP1P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "None", "LennardJones", "None", "ParticleParticle", "", "Force" },
+    { nb_kernel_ElecEw_VdwLJ_GeomW3P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecEw_VdwLJ_GeomW3P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "None", "LennardJones", "None", "Water3Particle", "", "PotentialAndForce" },
+    { nb_kernel_ElecEw_VdwLJ_GeomW3P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecEw_VdwLJ_GeomW3P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "None", "LennardJones", "None", "Water3Particle", "", "Force" },
+    { nb_kernel_ElecEw_VdwLJ_GeomW3W3_VF_sparc64_hpc_ace_double, "nb_kernel_ElecEw_VdwLJ_GeomW3W3_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "None", "LennardJones", "None", "Water3Water3", "", "PotentialAndForce" },
+    { nb_kernel_ElecEw_VdwLJ_GeomW3W3_F_sparc64_hpc_ace_double, "nb_kernel_ElecEw_VdwLJ_GeomW3W3_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "None", "LennardJones", "None", "Water3Water3", "", "Force" },
+    { nb_kernel_ElecEw_VdwLJ_GeomW4P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecEw_VdwLJ_GeomW4P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "None", "LennardJones", "None", "Water4Particle", "", "PotentialAndForce" },
+    { nb_kernel_ElecEw_VdwLJ_GeomW4P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecEw_VdwLJ_GeomW4P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "None", "LennardJones", "None", "Water4Particle", "", "Force" },
+    { nb_kernel_ElecEw_VdwLJ_GeomW4W4_VF_sparc64_hpc_ace_double, "nb_kernel_ElecEw_VdwLJ_GeomW4W4_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "None", "LennardJones", "None", "Water4Water4", "", "PotentialAndForce" },
+    { nb_kernel_ElecEw_VdwLJ_GeomW4W4_F_sparc64_hpc_ace_double, "nb_kernel_ElecEw_VdwLJ_GeomW4W4_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "None", "LennardJones", "None", "Water4Water4", "", "Force" },
+    { nb_kernel_ElecEw_VdwNone_GeomP1P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecEw_VdwNone_GeomP1P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "None", "None", "None", "ParticleParticle", "", "PotentialAndForce" },
+    { nb_kernel_ElecEw_VdwNone_GeomP1P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecEw_VdwNone_GeomP1P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "None", "None", "None", "ParticleParticle", "", "Force" },
+    { nb_kernel_ElecEw_VdwNone_GeomW3P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecEw_VdwNone_GeomW3P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "None", "None", "None", "Water3Particle", "", "PotentialAndForce" },
+    { nb_kernel_ElecEw_VdwNone_GeomW3P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecEw_VdwNone_GeomW3P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "None", "None", "None", "Water3Particle", "", "Force" },
+    { nb_kernel_ElecEw_VdwNone_GeomW3W3_VF_sparc64_hpc_ace_double, "nb_kernel_ElecEw_VdwNone_GeomW3W3_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "None", "None", "None", "Water3Water3", "", "PotentialAndForce" },
+    { nb_kernel_ElecEw_VdwNone_GeomW3W3_F_sparc64_hpc_ace_double, "nb_kernel_ElecEw_VdwNone_GeomW3W3_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "None", "None", "None", "Water3Water3", "", "Force" },
+    { nb_kernel_ElecEw_VdwNone_GeomW4P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecEw_VdwNone_GeomW4P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "None", "None", "None", "Water4Particle", "", "PotentialAndForce" },
+    { nb_kernel_ElecEw_VdwNone_GeomW4P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecEw_VdwNone_GeomW4P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "None", "None", "None", "Water4Particle", "", "Force" },
+    { nb_kernel_ElecEw_VdwNone_GeomW4W4_VF_sparc64_hpc_ace_double, "nb_kernel_ElecEw_VdwNone_GeomW4W4_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "None", "None", "None", "Water4Water4", "", "PotentialAndForce" },
+    { nb_kernel_ElecEw_VdwNone_GeomW4W4_F_sparc64_hpc_ace_double, "nb_kernel_ElecEw_VdwNone_GeomW4W4_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "None", "None", "None", "Water4Water4", "", "Force" },
+    { nb_kernel_ElecEw_VdwCSTab_GeomP1P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecEw_VdwCSTab_GeomP1P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "None", "CubicSplineTable", "None", "ParticleParticle", "", "PotentialAndForce" },
+    { nb_kernel_ElecEw_VdwCSTab_GeomP1P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecEw_VdwCSTab_GeomP1P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "None", "CubicSplineTable", "None", "ParticleParticle", "", "Force" },
+    { nb_kernel_ElecEw_VdwCSTab_GeomW3P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecEw_VdwCSTab_GeomW3P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "None", "CubicSplineTable", "None", "Water3Particle", "", "PotentialAndForce" },
+    { nb_kernel_ElecEw_VdwCSTab_GeomW3P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecEw_VdwCSTab_GeomW3P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "None", "CubicSplineTable", "None", "Water3Particle", "", "Force" },
+    { nb_kernel_ElecEw_VdwCSTab_GeomW3W3_VF_sparc64_hpc_ace_double, "nb_kernel_ElecEw_VdwCSTab_GeomW3W3_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "None", "CubicSplineTable", "None", "Water3Water3", "", "PotentialAndForce" },
+    { nb_kernel_ElecEw_VdwCSTab_GeomW3W3_F_sparc64_hpc_ace_double, "nb_kernel_ElecEw_VdwCSTab_GeomW3W3_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "None", "CubicSplineTable", "None", "Water3Water3", "", "Force" },
+    { nb_kernel_ElecEw_VdwCSTab_GeomW4P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecEw_VdwCSTab_GeomW4P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "None", "CubicSplineTable", "None", "Water4Particle", "", "PotentialAndForce" },
+    { nb_kernel_ElecEw_VdwCSTab_GeomW4P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecEw_VdwCSTab_GeomW4P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "None", "CubicSplineTable", "None", "Water4Particle", "", "Force" },
+    { nb_kernel_ElecEw_VdwCSTab_GeomW4W4_VF_sparc64_hpc_ace_double, "nb_kernel_ElecEw_VdwCSTab_GeomW4W4_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "None", "CubicSplineTable", "None", "Water4Water4", "", "PotentialAndForce" },
+    { nb_kernel_ElecEw_VdwCSTab_GeomW4W4_F_sparc64_hpc_ace_double, "nb_kernel_ElecEw_VdwCSTab_GeomW4W4_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "None", "CubicSplineTable", "None", "Water4Water4", "", "Force" },
+    { nb_kernel_ElecEwSh_VdwLJSh_GeomP1P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecEwSh_VdwLJSh_GeomP1P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "PotentialShift", "LennardJones", "PotentialShift", "ParticleParticle", "", "PotentialAndForce" },
+    { nb_kernel_ElecEwSh_VdwLJSh_GeomP1P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecEwSh_VdwLJSh_GeomP1P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "PotentialShift", "LennardJones", "PotentialShift", "ParticleParticle", "", "Force" },
+    { nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "PotentialShift", "LennardJones", "PotentialShift", "Water3Particle", "", "PotentialAndForce" },
+    { nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "PotentialShift", "LennardJones", "PotentialShift", "Water3Particle", "", "Force" },
+    { nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_VF_sparc64_hpc_ace_double, "nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "PotentialShift", "LennardJones", "PotentialShift", "Water3Water3", "", "PotentialAndForce" },
+    { nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_F_sparc64_hpc_ace_double, "nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "PotentialShift", "LennardJones", "PotentialShift", "Water3Water3", "", "Force" },
+    { nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "PotentialShift", "LennardJones", "PotentialShift", "Water4Particle", "", "PotentialAndForce" },
+    { nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "PotentialShift", "LennardJones", "PotentialShift", "Water4Particle", "", "Force" },
+    { nb_kernel_ElecEwSh_VdwLJSh_GeomW4W4_VF_sparc64_hpc_ace_double, "nb_kernel_ElecEwSh_VdwLJSh_GeomW4W4_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "PotentialShift", "LennardJones", "PotentialShift", "Water4Water4", "", "PotentialAndForce" },
+    { nb_kernel_ElecEwSh_VdwLJSh_GeomW4W4_F_sparc64_hpc_ace_double, "nb_kernel_ElecEwSh_VdwLJSh_GeomW4W4_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "PotentialShift", "LennardJones", "PotentialShift", "Water4Water4", "", "Force" },
+    { nb_kernel_ElecEwSh_VdwNone_GeomP1P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecEwSh_VdwNone_GeomP1P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "PotentialShift", "None", "None", "ParticleParticle", "", "PotentialAndForce" },
+    { nb_kernel_ElecEwSh_VdwNone_GeomP1P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecEwSh_VdwNone_GeomP1P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "PotentialShift", "None", "None", "ParticleParticle", "", "Force" },
+    { nb_kernel_ElecEwSh_VdwNone_GeomW3P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecEwSh_VdwNone_GeomW3P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "PotentialShift", "None", "None", "Water3Particle", "", "PotentialAndForce" },
+    { nb_kernel_ElecEwSh_VdwNone_GeomW3P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecEwSh_VdwNone_GeomW3P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "PotentialShift", "None", "None", "Water3Particle", "", "Force" },
+    { nb_kernel_ElecEwSh_VdwNone_GeomW3W3_VF_sparc64_hpc_ace_double, "nb_kernel_ElecEwSh_VdwNone_GeomW3W3_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "PotentialShift", "None", "None", "Water3Water3", "", "PotentialAndForce" },
+    { nb_kernel_ElecEwSh_VdwNone_GeomW3W3_F_sparc64_hpc_ace_double, "nb_kernel_ElecEwSh_VdwNone_GeomW3W3_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "PotentialShift", "None", "None", "Water3Water3", "", "Force" },
+    { nb_kernel_ElecEwSh_VdwNone_GeomW4P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecEwSh_VdwNone_GeomW4P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "PotentialShift", "None", "None", "Water4Particle", "", "PotentialAndForce" },
+    { nb_kernel_ElecEwSh_VdwNone_GeomW4P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecEwSh_VdwNone_GeomW4P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "PotentialShift", "None", "None", "Water4Particle", "", "Force" },
+    { nb_kernel_ElecEwSh_VdwNone_GeomW4W4_VF_sparc64_hpc_ace_double, "nb_kernel_ElecEwSh_VdwNone_GeomW4W4_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "PotentialShift", "None", "None", "Water4Water4", "", "PotentialAndForce" },
+    { nb_kernel_ElecEwSh_VdwNone_GeomW4W4_F_sparc64_hpc_ace_double, "nb_kernel_ElecEwSh_VdwNone_GeomW4W4_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "PotentialShift", "None", "None", "Water4Water4", "", "Force" },
+    { nb_kernel_ElecEwSw_VdwLJSw_GeomP1P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecEwSw_VdwLJSw_GeomP1P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "PotentialSwitch", "LennardJones", "PotentialSwitch", "ParticleParticle", "", "PotentialAndForce" },
+    { nb_kernel_ElecEwSw_VdwLJSw_GeomP1P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecEwSw_VdwLJSw_GeomP1P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "PotentialSwitch", "LennardJones", "PotentialSwitch", "ParticleParticle", "", "Force" },
+    { nb_kernel_ElecEwSw_VdwLJSw_GeomW3P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecEwSw_VdwLJSw_GeomW3P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "PotentialSwitch", "LennardJones", "PotentialSwitch", "Water3Particle", "", "PotentialAndForce" },
+    { nb_kernel_ElecEwSw_VdwLJSw_GeomW3P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecEwSw_VdwLJSw_GeomW3P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "PotentialSwitch", "LennardJones", "PotentialSwitch", "Water3Particle", "", "Force" },
+    { nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_VF_sparc64_hpc_ace_double, "nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "PotentialSwitch", "LennardJones", "PotentialSwitch", "Water3Water3", "", "PotentialAndForce" },
+    { nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_F_sparc64_hpc_ace_double, "nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "PotentialSwitch", "LennardJones", "PotentialSwitch", "Water3Water3", "", "Force" },
+    { nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "PotentialSwitch", "LennardJones", "PotentialSwitch", "Water4Particle", "", "PotentialAndForce" },
+    { nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "PotentialSwitch", "LennardJones", "PotentialSwitch", "Water4Particle", "", "Force" },
+    { nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_VF_sparc64_hpc_ace_double, "nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "PotentialSwitch", "LennardJones", "PotentialSwitch", "Water4Water4", "", "PotentialAndForce" },
+    { nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_F_sparc64_hpc_ace_double, "nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "PotentialSwitch", "LennardJones", "PotentialSwitch", "Water4Water4", "", "Force" },
+    { nb_kernel_ElecEwSw_VdwNone_GeomP1P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecEwSw_VdwNone_GeomP1P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "PotentialSwitch", "None", "None", "ParticleParticle", "", "PotentialAndForce" },
+    { nb_kernel_ElecEwSw_VdwNone_GeomP1P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecEwSw_VdwNone_GeomP1P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "PotentialSwitch", "None", "None", "ParticleParticle", "", "Force" },
+    { nb_kernel_ElecEwSw_VdwNone_GeomW3P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecEwSw_VdwNone_GeomW3P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "PotentialSwitch", "None", "None", "Water3Particle", "", "PotentialAndForce" },
+    { nb_kernel_ElecEwSw_VdwNone_GeomW3P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecEwSw_VdwNone_GeomW3P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "PotentialSwitch", "None", "None", "Water3Particle", "", "Force" },
+    { nb_kernel_ElecEwSw_VdwNone_GeomW3W3_VF_sparc64_hpc_ace_double, "nb_kernel_ElecEwSw_VdwNone_GeomW3W3_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "PotentialSwitch", "None", "None", "Water3Water3", "", "PotentialAndForce" },
+    { nb_kernel_ElecEwSw_VdwNone_GeomW3W3_F_sparc64_hpc_ace_double, "nb_kernel_ElecEwSw_VdwNone_GeomW3W3_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "PotentialSwitch", "None", "None", "Water3Water3", "", "Force" },
+    { nb_kernel_ElecEwSw_VdwNone_GeomW4P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecEwSw_VdwNone_GeomW4P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "PotentialSwitch", "None", "None", "Water4Particle", "", "PotentialAndForce" },
+    { nb_kernel_ElecEwSw_VdwNone_GeomW4P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecEwSw_VdwNone_GeomW4P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "PotentialSwitch", "None", "None", "Water4Particle", "", "Force" },
+    { nb_kernel_ElecEwSw_VdwNone_GeomW4W4_VF_sparc64_hpc_ace_double, "nb_kernel_ElecEwSw_VdwNone_GeomW4W4_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "PotentialSwitch", "None", "None", "Water4Water4", "", "PotentialAndForce" },
+    { nb_kernel_ElecEwSw_VdwNone_GeomW4W4_F_sparc64_hpc_ace_double, "nb_kernel_ElecEwSw_VdwNone_GeomW4W4_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Ewald", "PotentialSwitch", "None", "None", "Water4Water4", "", "Force" },
+    { nb_kernel_ElecCoul_VdwLJ_GeomP1P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecCoul_VdwLJ_GeomP1P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Coulomb", "None", "LennardJones", "None", "ParticleParticle", "", "PotentialAndForce" },
+    { nb_kernel_ElecCoul_VdwLJ_GeomP1P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecCoul_VdwLJ_GeomP1P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Coulomb", "None", "LennardJones", "None", "ParticleParticle", "", "Force" },
+    { nb_kernel_ElecCoul_VdwLJ_GeomW3P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecCoul_VdwLJ_GeomW3P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Coulomb", "None", "LennardJones", "None", "Water3Particle", "", "PotentialAndForce" },
+    { nb_kernel_ElecCoul_VdwLJ_GeomW3P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecCoul_VdwLJ_GeomW3P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Coulomb", "None", "LennardJones", "None", "Water3Particle", "", "Force" },
+    { nb_kernel_ElecCoul_VdwLJ_GeomW3W3_VF_sparc64_hpc_ace_double, "nb_kernel_ElecCoul_VdwLJ_GeomW3W3_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Coulomb", "None", "LennardJones", "None", "Water3Water3", "", "PotentialAndForce" },
+    { nb_kernel_ElecCoul_VdwLJ_GeomW3W3_F_sparc64_hpc_ace_double, "nb_kernel_ElecCoul_VdwLJ_GeomW3W3_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Coulomb", "None", "LennardJones", "None", "Water3Water3", "", "Force" },
+    { nb_kernel_ElecCoul_VdwLJ_GeomW4P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecCoul_VdwLJ_GeomW4P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Coulomb", "None", "LennardJones", "None", "Water4Particle", "", "PotentialAndForce" },
+    { nb_kernel_ElecCoul_VdwLJ_GeomW4P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecCoul_VdwLJ_GeomW4P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Coulomb", "None", "LennardJones", "None", "Water4Particle", "", "Force" },
+    { nb_kernel_ElecCoul_VdwLJ_GeomW4W4_VF_sparc64_hpc_ace_double, "nb_kernel_ElecCoul_VdwLJ_GeomW4W4_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Coulomb", "None", "LennardJones", "None", "Water4Water4", "", "PotentialAndForce" },
+    { nb_kernel_ElecCoul_VdwLJ_GeomW4W4_F_sparc64_hpc_ace_double, "nb_kernel_ElecCoul_VdwLJ_GeomW4W4_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Coulomb", "None", "LennardJones", "None", "Water4Water4", "", "Force" },
+    { nb_kernel_ElecCoul_VdwNone_GeomP1P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecCoul_VdwNone_GeomP1P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Coulomb", "None", "None", "None", "ParticleParticle", "", "PotentialAndForce" },
+    { nb_kernel_ElecCoul_VdwNone_GeomP1P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecCoul_VdwNone_GeomP1P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Coulomb", "None", "None", "None", "ParticleParticle", "", "Force" },
+    { nb_kernel_ElecCoul_VdwNone_GeomW3P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecCoul_VdwNone_GeomW3P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Coulomb", "None", "None", "None", "Water3Particle", "", "PotentialAndForce" },
+    { nb_kernel_ElecCoul_VdwNone_GeomW3P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecCoul_VdwNone_GeomW3P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Coulomb", "None", "None", "None", "Water3Particle", "", "Force" },
+    { nb_kernel_ElecCoul_VdwNone_GeomW3W3_VF_sparc64_hpc_ace_double, "nb_kernel_ElecCoul_VdwNone_GeomW3W3_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Coulomb", "None", "None", "None", "Water3Water3", "", "PotentialAndForce" },
+    { nb_kernel_ElecCoul_VdwNone_GeomW3W3_F_sparc64_hpc_ace_double, "nb_kernel_ElecCoul_VdwNone_GeomW3W3_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Coulomb", "None", "None", "None", "Water3Water3", "", "Force" },
+    { nb_kernel_ElecCoul_VdwNone_GeomW4P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecCoul_VdwNone_GeomW4P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Coulomb", "None", "None", "None", "Water4Particle", "", "PotentialAndForce" },
+    { nb_kernel_ElecCoul_VdwNone_GeomW4P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecCoul_VdwNone_GeomW4P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Coulomb", "None", "None", "None", "Water4Particle", "", "Force" },
+    { nb_kernel_ElecCoul_VdwNone_GeomW4W4_VF_sparc64_hpc_ace_double, "nb_kernel_ElecCoul_VdwNone_GeomW4W4_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Coulomb", "None", "None", "None", "Water4Water4", "", "PotentialAndForce" },
+    { nb_kernel_ElecCoul_VdwNone_GeomW4W4_F_sparc64_hpc_ace_double, "nb_kernel_ElecCoul_VdwNone_GeomW4W4_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Coulomb", "None", "None", "None", "Water4Water4", "", "Force" },
+    { nb_kernel_ElecCoul_VdwCSTab_GeomP1P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecCoul_VdwCSTab_GeomP1P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Coulomb", "None", "CubicSplineTable", "None", "ParticleParticle", "", "PotentialAndForce" },
+    { nb_kernel_ElecCoul_VdwCSTab_GeomP1P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecCoul_VdwCSTab_GeomP1P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Coulomb", "None", "CubicSplineTable", "None", "ParticleParticle", "", "Force" },
+    { nb_kernel_ElecCoul_VdwCSTab_GeomW3P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecCoul_VdwCSTab_GeomW3P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Coulomb", "None", "CubicSplineTable", "None", "Water3Particle", "", "PotentialAndForce" },
+    { nb_kernel_ElecCoul_VdwCSTab_GeomW3P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecCoul_VdwCSTab_GeomW3P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Coulomb", "None", "CubicSplineTable", "None", "Water3Particle", "", "Force" },
+    { nb_kernel_ElecCoul_VdwCSTab_GeomW3W3_VF_sparc64_hpc_ace_double, "nb_kernel_ElecCoul_VdwCSTab_GeomW3W3_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Coulomb", "None", "CubicSplineTable", "None", "Water3Water3", "", "PotentialAndForce" },
+    { nb_kernel_ElecCoul_VdwCSTab_GeomW3W3_F_sparc64_hpc_ace_double, "nb_kernel_ElecCoul_VdwCSTab_GeomW3W3_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Coulomb", "None", "CubicSplineTable", "None", "Water3Water3", "", "Force" },
+    { nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Coulomb", "None", "CubicSplineTable", "None", "Water4Particle", "", "PotentialAndForce" },
+    { nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Coulomb", "None", "CubicSplineTable", "None", "Water4Particle", "", "Force" },
+    { nb_kernel_ElecCoul_VdwCSTab_GeomW4W4_VF_sparc64_hpc_ace_double, "nb_kernel_ElecCoul_VdwCSTab_GeomW4W4_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Coulomb", "None", "CubicSplineTable", "None", "Water4Water4", "", "PotentialAndForce" },
+    { nb_kernel_ElecCoul_VdwCSTab_GeomW4W4_F_sparc64_hpc_ace_double, "nb_kernel_ElecCoul_VdwCSTab_GeomW4W4_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "Coulomb", "None", "CubicSplineTable", "None", "Water4Water4", "", "Force" },
+    { nb_kernel_ElecCSTab_VdwLJ_GeomP1P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecCSTab_VdwLJ_GeomP1P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "CubicSplineTable", "None", "LennardJones", "None", "ParticleParticle", "", "PotentialAndForce" },
+    { nb_kernel_ElecCSTab_VdwLJ_GeomP1P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecCSTab_VdwLJ_GeomP1P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "CubicSplineTable", "None", "LennardJones", "None", "ParticleParticle", "", "Force" },
+    { nb_kernel_ElecCSTab_VdwLJ_GeomW3P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecCSTab_VdwLJ_GeomW3P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "CubicSplineTable", "None", "LennardJones", "None", "Water3Particle", "", "PotentialAndForce" },
+    { nb_kernel_ElecCSTab_VdwLJ_GeomW3P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecCSTab_VdwLJ_GeomW3P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "CubicSplineTable", "None", "LennardJones", "None", "Water3Particle", "", "Force" },
+    { nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_VF_sparc64_hpc_ace_double, "nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "CubicSplineTable", "None", "LennardJones", "None", "Water3Water3", "", "PotentialAndForce" },
+    { nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_F_sparc64_hpc_ace_double, "nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "CubicSplineTable", "None", "LennardJones", "None", "Water3Water3", "", "Force" },
+    { nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "CubicSplineTable", "None", "LennardJones", "None", "Water4Particle", "", "PotentialAndForce" },
+    { nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "CubicSplineTable", "None", "LennardJones", "None", "Water4Particle", "", "Force" },
+    { nb_kernel_ElecCSTab_VdwLJ_GeomW4W4_VF_sparc64_hpc_ace_double, "nb_kernel_ElecCSTab_VdwLJ_GeomW4W4_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "CubicSplineTable", "None", "LennardJones", "None", "Water4Water4", "", "PotentialAndForce" },
+    { nb_kernel_ElecCSTab_VdwLJ_GeomW4W4_F_sparc64_hpc_ace_double, "nb_kernel_ElecCSTab_VdwLJ_GeomW4W4_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "CubicSplineTable", "None", "LennardJones", "None", "Water4Water4", "", "Force" },
+    { nb_kernel_ElecCSTab_VdwNone_GeomP1P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecCSTab_VdwNone_GeomP1P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "CubicSplineTable", "None", "None", "None", "ParticleParticle", "", "PotentialAndForce" },
+    { nb_kernel_ElecCSTab_VdwNone_GeomP1P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecCSTab_VdwNone_GeomP1P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "CubicSplineTable", "None", "None", "None", "ParticleParticle", "", "Force" },
+    { nb_kernel_ElecCSTab_VdwNone_GeomW3P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecCSTab_VdwNone_GeomW3P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "CubicSplineTable", "None", "None", "None", "Water3Particle", "", "PotentialAndForce" },
+    { nb_kernel_ElecCSTab_VdwNone_GeomW3P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecCSTab_VdwNone_GeomW3P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "CubicSplineTable", "None", "None", "None", "Water3Particle", "", "Force" },
+    { nb_kernel_ElecCSTab_VdwNone_GeomW3W3_VF_sparc64_hpc_ace_double, "nb_kernel_ElecCSTab_VdwNone_GeomW3W3_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "CubicSplineTable", "None", "None", "None", "Water3Water3", "", "PotentialAndForce" },
+    { nb_kernel_ElecCSTab_VdwNone_GeomW3W3_F_sparc64_hpc_ace_double, "nb_kernel_ElecCSTab_VdwNone_GeomW3W3_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "CubicSplineTable", "None", "None", "None", "Water3Water3", "", "Force" },
+    { nb_kernel_ElecCSTab_VdwNone_GeomW4P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecCSTab_VdwNone_GeomW4P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "CubicSplineTable", "None", "None", "None", "Water4Particle", "", "PotentialAndForce" },
+    { nb_kernel_ElecCSTab_VdwNone_GeomW4P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecCSTab_VdwNone_GeomW4P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "CubicSplineTable", "None", "None", "None", "Water4Particle", "", "Force" },
+    { nb_kernel_ElecCSTab_VdwNone_GeomW4W4_VF_sparc64_hpc_ace_double, "nb_kernel_ElecCSTab_VdwNone_GeomW4W4_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "CubicSplineTable", "None", "None", "None", "Water4Water4", "", "PotentialAndForce" },
+    { nb_kernel_ElecCSTab_VdwNone_GeomW4W4_F_sparc64_hpc_ace_double, "nb_kernel_ElecCSTab_VdwNone_GeomW4W4_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "CubicSplineTable", "None", "None", "None", "Water4Water4", "", "Force" },
+    { nb_kernel_ElecCSTab_VdwCSTab_GeomP1P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecCSTab_VdwCSTab_GeomP1P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "CubicSplineTable", "None", "CubicSplineTable", "None", "ParticleParticle", "", "PotentialAndForce" },
+    { nb_kernel_ElecCSTab_VdwCSTab_GeomP1P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecCSTab_VdwCSTab_GeomP1P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "CubicSplineTable", "None", "CubicSplineTable", "None", "ParticleParticle", "", "Force" },
+    { nb_kernel_ElecCSTab_VdwCSTab_GeomW3P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecCSTab_VdwCSTab_GeomW3P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "CubicSplineTable", "None", "CubicSplineTable", "None", "Water3Particle", "", "PotentialAndForce" },
+    { nb_kernel_ElecCSTab_VdwCSTab_GeomW3P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecCSTab_VdwCSTab_GeomW3P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "CubicSplineTable", "None", "CubicSplineTable", "None", "Water3Particle", "", "Force" },
+    { nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_VF_sparc64_hpc_ace_double, "nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "CubicSplineTable", "None", "CubicSplineTable", "None", "Water3Water3", "", "PotentialAndForce" },
+    { nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_F_sparc64_hpc_ace_double, "nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "CubicSplineTable", "None", "CubicSplineTable", "None", "Water3Water3", "", "Force" },
+    { nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "CubicSplineTable", "None", "CubicSplineTable", "None", "Water4Particle", "", "PotentialAndForce" },
+    { nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "CubicSplineTable", "None", "CubicSplineTable", "None", "Water4Particle", "", "Force" },
+    { nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_VF_sparc64_hpc_ace_double, "nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "CubicSplineTable", "None", "CubicSplineTable", "None", "Water4Water4", "", "PotentialAndForce" },
+    { nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_F_sparc64_hpc_ace_double, "nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "CubicSplineTable", "None", "CubicSplineTable", "None", "Water4Water4", "", "Force" },
+    { nb_kernel_ElecGB_VdwLJ_GeomP1P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecGB_VdwLJ_GeomP1P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "GeneralizedBorn", "None", "LennardJones", "None", "ParticleParticle", "", "PotentialAndForce" },
+    { nb_kernel_ElecGB_VdwLJ_GeomP1P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecGB_VdwLJ_GeomP1P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "GeneralizedBorn", "None", "LennardJones", "None", "ParticleParticle", "", "Force" },
+    { nb_kernel_ElecGB_VdwNone_GeomP1P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecGB_VdwNone_GeomP1P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "GeneralizedBorn", "None", "None", "None", "ParticleParticle", "", "PotentialAndForce" },
+    { nb_kernel_ElecGB_VdwNone_GeomP1P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecGB_VdwNone_GeomP1P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "GeneralizedBorn", "None", "None", "None", "ParticleParticle", "", "Force" },
+    { nb_kernel_ElecGB_VdwCSTab_GeomP1P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecGB_VdwCSTab_GeomP1P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "GeneralizedBorn", "None", "CubicSplineTable", "None", "ParticleParticle", "", "PotentialAndForce" },
+    { nb_kernel_ElecGB_VdwCSTab_GeomP1P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecGB_VdwCSTab_GeomP1P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "GeneralizedBorn", "None", "CubicSplineTable", "None", "ParticleParticle", "", "Force" },
+    { nb_kernel_ElecRFCut_VdwLJSh_GeomP1P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecRFCut_VdwLJSh_GeomP1P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "ExactCutoff", "LennardJones", "PotentialShift", "ParticleParticle", "", "PotentialAndForce" },
+    { nb_kernel_ElecRFCut_VdwLJSh_GeomP1P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecRFCut_VdwLJSh_GeomP1P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "ExactCutoff", "LennardJones", "PotentialShift", "ParticleParticle", "", "Force" },
+    { nb_kernel_ElecRFCut_VdwLJSh_GeomW3P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecRFCut_VdwLJSh_GeomW3P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "ExactCutoff", "LennardJones", "PotentialShift", "Water3Particle", "", "PotentialAndForce" },
+    { nb_kernel_ElecRFCut_VdwLJSh_GeomW3P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecRFCut_VdwLJSh_GeomW3P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "ExactCutoff", "LennardJones", "PotentialShift", "Water3Particle", "", "Force" },
+    { nb_kernel_ElecRFCut_VdwLJSh_GeomW3W3_VF_sparc64_hpc_ace_double, "nb_kernel_ElecRFCut_VdwLJSh_GeomW3W3_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "ExactCutoff", "LennardJones", "PotentialShift", "Water3Water3", "", "PotentialAndForce" },
+    { nb_kernel_ElecRFCut_VdwLJSh_GeomW3W3_F_sparc64_hpc_ace_double, "nb_kernel_ElecRFCut_VdwLJSh_GeomW3W3_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "ExactCutoff", "LennardJones", "PotentialShift", "Water3Water3", "", "Force" },
+    { nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "ExactCutoff", "LennardJones", "PotentialShift", "Water4Particle", "", "PotentialAndForce" },
+    { nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "ExactCutoff", "LennardJones", "PotentialShift", "Water4Particle", "", "Force" },
+    { nb_kernel_ElecRFCut_VdwLJSh_GeomW4W4_VF_sparc64_hpc_ace_double, "nb_kernel_ElecRFCut_VdwLJSh_GeomW4W4_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "ExactCutoff", "LennardJones", "PotentialShift", "Water4Water4", "", "PotentialAndForce" },
+    { nb_kernel_ElecRFCut_VdwLJSh_GeomW4W4_F_sparc64_hpc_ace_double, "nb_kernel_ElecRFCut_VdwLJSh_GeomW4W4_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "ExactCutoff", "LennardJones", "PotentialShift", "Water4Water4", "", "Force" },
+    { nb_kernel_ElecRFCut_VdwLJSw_GeomP1P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecRFCut_VdwLJSw_GeomP1P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "ExactCutoff", "LennardJones", "PotentialSwitch", "ParticleParticle", "", "PotentialAndForce" },
+    { nb_kernel_ElecRFCut_VdwLJSw_GeomP1P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecRFCut_VdwLJSw_GeomP1P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "ExactCutoff", "LennardJones", "PotentialSwitch", "ParticleParticle", "", "Force" },
+    { nb_kernel_ElecRFCut_VdwLJSw_GeomW3P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecRFCut_VdwLJSw_GeomW3P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "ExactCutoff", "LennardJones", "PotentialSwitch", "Water3Particle", "", "PotentialAndForce" },
+    { nb_kernel_ElecRFCut_VdwLJSw_GeomW3P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecRFCut_VdwLJSw_GeomW3P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "ExactCutoff", "LennardJones", "PotentialSwitch", "Water3Particle", "", "Force" },
+    { nb_kernel_ElecRFCut_VdwLJSw_GeomW3W3_VF_sparc64_hpc_ace_double, "nb_kernel_ElecRFCut_VdwLJSw_GeomW3W3_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "ExactCutoff", "LennardJones", "PotentialSwitch", "Water3Water3", "", "PotentialAndForce" },
+    { nb_kernel_ElecRFCut_VdwLJSw_GeomW3W3_F_sparc64_hpc_ace_double, "nb_kernel_ElecRFCut_VdwLJSw_GeomW3W3_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "ExactCutoff", "LennardJones", "PotentialSwitch", "Water3Water3", "", "Force" },
+    { nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "ExactCutoff", "LennardJones", "PotentialSwitch", "Water4Particle", "", "PotentialAndForce" },
+    { nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "ExactCutoff", "LennardJones", "PotentialSwitch", "Water4Particle", "", "Force" },
+    { nb_kernel_ElecRFCut_VdwLJSw_GeomW4W4_VF_sparc64_hpc_ace_double, "nb_kernel_ElecRFCut_VdwLJSw_GeomW4W4_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "ExactCutoff", "LennardJones", "PotentialSwitch", "Water4Water4", "", "PotentialAndForce" },
+    { nb_kernel_ElecRFCut_VdwLJSw_GeomW4W4_F_sparc64_hpc_ace_double, "nb_kernel_ElecRFCut_VdwLJSw_GeomW4W4_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "ExactCutoff", "LennardJones", "PotentialSwitch", "Water4Water4", "", "Force" },
+    { nb_kernel_ElecRFCut_VdwNone_GeomP1P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecRFCut_VdwNone_GeomP1P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "ExactCutoff", "None", "None", "ParticleParticle", "", "PotentialAndForce" },
+    { nb_kernel_ElecRFCut_VdwNone_GeomP1P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecRFCut_VdwNone_GeomP1P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "ExactCutoff", "None", "None", "ParticleParticle", "", "Force" },
+    { nb_kernel_ElecRFCut_VdwNone_GeomW3P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecRFCut_VdwNone_GeomW3P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "ExactCutoff", "None", "None", "Water3Particle", "", "PotentialAndForce" },
+    { nb_kernel_ElecRFCut_VdwNone_GeomW3P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecRFCut_VdwNone_GeomW3P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "ExactCutoff", "None", "None", "Water3Particle", "", "Force" },
+    { nb_kernel_ElecRFCut_VdwNone_GeomW3W3_VF_sparc64_hpc_ace_double, "nb_kernel_ElecRFCut_VdwNone_GeomW3W3_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "ExactCutoff", "None", "None", "Water3Water3", "", "PotentialAndForce" },
+    { nb_kernel_ElecRFCut_VdwNone_GeomW3W3_F_sparc64_hpc_ace_double, "nb_kernel_ElecRFCut_VdwNone_GeomW3W3_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "ExactCutoff", "None", "None", "Water3Water3", "", "Force" },
+    { nb_kernel_ElecRFCut_VdwNone_GeomW4P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecRFCut_VdwNone_GeomW4P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "ExactCutoff", "None", "None", "Water4Particle", "", "PotentialAndForce" },
+    { nb_kernel_ElecRFCut_VdwNone_GeomW4P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecRFCut_VdwNone_GeomW4P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "ExactCutoff", "None", "None", "Water4Particle", "", "Force" },
+    { nb_kernel_ElecRFCut_VdwNone_GeomW4W4_VF_sparc64_hpc_ace_double, "nb_kernel_ElecRFCut_VdwNone_GeomW4W4_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "ExactCutoff", "None", "None", "Water4Water4", "", "PotentialAndForce" },
+    { nb_kernel_ElecRFCut_VdwNone_GeomW4W4_F_sparc64_hpc_ace_double, "nb_kernel_ElecRFCut_VdwNone_GeomW4W4_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "ExactCutoff", "None", "None", "Water4Water4", "", "Force" },
+    { nb_kernel_ElecRFCut_VdwCSTab_GeomP1P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecRFCut_VdwCSTab_GeomP1P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "ExactCutoff", "CubicSplineTable", "None", "ParticleParticle", "", "PotentialAndForce" },
+    { nb_kernel_ElecRFCut_VdwCSTab_GeomP1P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecRFCut_VdwCSTab_GeomP1P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "ExactCutoff", "CubicSplineTable", "None", "ParticleParticle", "", "Force" },
+    { nb_kernel_ElecRFCut_VdwCSTab_GeomW3P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecRFCut_VdwCSTab_GeomW3P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "ExactCutoff", "CubicSplineTable", "None", "Water3Particle", "", "PotentialAndForce" },
+    { nb_kernel_ElecRFCut_VdwCSTab_GeomW3P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecRFCut_VdwCSTab_GeomW3P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "ExactCutoff", "CubicSplineTable", "None", "Water3Particle", "", "Force" },
+    { nb_kernel_ElecRFCut_VdwCSTab_GeomW3W3_VF_sparc64_hpc_ace_double, "nb_kernel_ElecRFCut_VdwCSTab_GeomW3W3_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "ExactCutoff", "CubicSplineTable", "None", "Water3Water3", "", "PotentialAndForce" },
+    { nb_kernel_ElecRFCut_VdwCSTab_GeomW3W3_F_sparc64_hpc_ace_double, "nb_kernel_ElecRFCut_VdwCSTab_GeomW3W3_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "ExactCutoff", "CubicSplineTable", "None", "Water3Water3", "", "Force" },
+    { nb_kernel_ElecRFCut_VdwCSTab_GeomW4P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecRFCut_VdwCSTab_GeomW4P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "ExactCutoff", "CubicSplineTable", "None", "Water4Particle", "", "PotentialAndForce" },
+    { nb_kernel_ElecRFCut_VdwCSTab_GeomW4P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecRFCut_VdwCSTab_GeomW4P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "ExactCutoff", "CubicSplineTable", "None", "Water4Particle", "", "Force" },
+    { nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_VF_sparc64_hpc_ace_double, "nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "ExactCutoff", "CubicSplineTable", "None", "Water4Water4", "", "PotentialAndForce" },
+    { nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_F_sparc64_hpc_ace_double, "nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "ExactCutoff", "CubicSplineTable", "None", "Water4Water4", "", "Force" },
+    { nb_kernel_ElecRF_VdwLJ_GeomP1P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecRF_VdwLJ_GeomP1P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "None", "LennardJones", "None", "ParticleParticle", "", "PotentialAndForce" },
+    { nb_kernel_ElecRF_VdwLJ_GeomP1P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecRF_VdwLJ_GeomP1P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "None", "LennardJones", "None", "ParticleParticle", "", "Force" },
+    { nb_kernel_ElecRF_VdwLJ_GeomW3P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecRF_VdwLJ_GeomW3P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "None", "LennardJones", "None", "Water3Particle", "", "PotentialAndForce" },
+    { nb_kernel_ElecRF_VdwLJ_GeomW3P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecRF_VdwLJ_GeomW3P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "None", "LennardJones", "None", "Water3Particle", "", "Force" },
+    { nb_kernel_ElecRF_VdwLJ_GeomW3W3_VF_sparc64_hpc_ace_double, "nb_kernel_ElecRF_VdwLJ_GeomW3W3_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "None", "LennardJones", "None", "Water3Water3", "", "PotentialAndForce" },
+    { nb_kernel_ElecRF_VdwLJ_GeomW3W3_F_sparc64_hpc_ace_double, "nb_kernel_ElecRF_VdwLJ_GeomW3W3_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "None", "LennardJones", "None", "Water3Water3", "", "Force" },
+    { nb_kernel_ElecRF_VdwLJ_GeomW4P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecRF_VdwLJ_GeomW4P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "None", "LennardJones", "None", "Water4Particle", "", "PotentialAndForce" },
+    { nb_kernel_ElecRF_VdwLJ_GeomW4P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecRF_VdwLJ_GeomW4P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "None", "LennardJones", "None", "Water4Particle", "", "Force" },
+    { nb_kernel_ElecRF_VdwLJ_GeomW4W4_VF_sparc64_hpc_ace_double, "nb_kernel_ElecRF_VdwLJ_GeomW4W4_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "None", "LennardJones", "None", "Water4Water4", "", "PotentialAndForce" },
+    { nb_kernel_ElecRF_VdwLJ_GeomW4W4_F_sparc64_hpc_ace_double, "nb_kernel_ElecRF_VdwLJ_GeomW4W4_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "None", "LennardJones", "None", "Water4Water4", "", "Force" },
+    { nb_kernel_ElecRF_VdwNone_GeomP1P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecRF_VdwNone_GeomP1P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "None", "None", "None", "ParticleParticle", "", "PotentialAndForce" },
+    { nb_kernel_ElecRF_VdwNone_GeomP1P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecRF_VdwNone_GeomP1P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "None", "None", "None", "ParticleParticle", "", "Force" },
+    { nb_kernel_ElecRF_VdwNone_GeomW3P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecRF_VdwNone_GeomW3P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "None", "None", "None", "Water3Particle", "", "PotentialAndForce" },
+    { nb_kernel_ElecRF_VdwNone_GeomW3P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecRF_VdwNone_GeomW3P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "None", "None", "None", "Water3Particle", "", "Force" },
+    { nb_kernel_ElecRF_VdwNone_GeomW3W3_VF_sparc64_hpc_ace_double, "nb_kernel_ElecRF_VdwNone_GeomW3W3_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "None", "None", "None", "Water3Water3", "", "PotentialAndForce" },
+    { nb_kernel_ElecRF_VdwNone_GeomW3W3_F_sparc64_hpc_ace_double, "nb_kernel_ElecRF_VdwNone_GeomW3W3_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "None", "None", "None", "Water3Water3", "", "Force" },
+    { nb_kernel_ElecRF_VdwNone_GeomW4P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecRF_VdwNone_GeomW4P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "None", "None", "None", "Water4Particle", "", "PotentialAndForce" },
+    { nb_kernel_ElecRF_VdwNone_GeomW4P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecRF_VdwNone_GeomW4P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "None", "None", "None", "Water4Particle", "", "Force" },
+    { nb_kernel_ElecRF_VdwNone_GeomW4W4_VF_sparc64_hpc_ace_double, "nb_kernel_ElecRF_VdwNone_GeomW4W4_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "None", "None", "None", "Water4Water4", "", "PotentialAndForce" },
+    { nb_kernel_ElecRF_VdwNone_GeomW4W4_F_sparc64_hpc_ace_double, "nb_kernel_ElecRF_VdwNone_GeomW4W4_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "None", "None", "None", "Water4Water4", "", "Force" },
+    { nb_kernel_ElecRF_VdwCSTab_GeomP1P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecRF_VdwCSTab_GeomP1P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "None", "CubicSplineTable", "None", "ParticleParticle", "", "PotentialAndForce" },
+    { nb_kernel_ElecRF_VdwCSTab_GeomP1P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecRF_VdwCSTab_GeomP1P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "None", "CubicSplineTable", "None", "ParticleParticle", "", "Force" },
+    { nb_kernel_ElecRF_VdwCSTab_GeomW3P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecRF_VdwCSTab_GeomW3P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "None", "CubicSplineTable", "None", "Water3Particle", "", "PotentialAndForce" },
+    { nb_kernel_ElecRF_VdwCSTab_GeomW3P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecRF_VdwCSTab_GeomW3P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "None", "CubicSplineTable", "None", "Water3Particle", "", "Force" },
+    { nb_kernel_ElecRF_VdwCSTab_GeomW3W3_VF_sparc64_hpc_ace_double, "nb_kernel_ElecRF_VdwCSTab_GeomW3W3_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "None", "CubicSplineTable", "None", "Water3Water3", "", "PotentialAndForce" },
+    { nb_kernel_ElecRF_VdwCSTab_GeomW3W3_F_sparc64_hpc_ace_double, "nb_kernel_ElecRF_VdwCSTab_GeomW3W3_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "None", "CubicSplineTable", "None", "Water3Water3", "", "Force" },
+    { nb_kernel_ElecRF_VdwCSTab_GeomW4P1_VF_sparc64_hpc_ace_double, "nb_kernel_ElecRF_VdwCSTab_GeomW4P1_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "None", "CubicSplineTable", "None", "Water4Particle", "", "PotentialAndForce" },
+    { nb_kernel_ElecRF_VdwCSTab_GeomW4P1_F_sparc64_hpc_ace_double, "nb_kernel_ElecRF_VdwCSTab_GeomW4P1_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "None", "CubicSplineTable", "None", "Water4Particle", "", "Force" },
+    { nb_kernel_ElecRF_VdwCSTab_GeomW4W4_VF_sparc64_hpc_ace_double, "nb_kernel_ElecRF_VdwCSTab_GeomW4W4_VF_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "None", "CubicSplineTable", "None", "Water4Water4", "", "PotentialAndForce" },
+    { nb_kernel_ElecRF_VdwCSTab_GeomW4W4_F_sparc64_hpc_ace_double, "nb_kernel_ElecRF_VdwCSTab_GeomW4W4_F_sparc64_hpc_ace_double", "sparc64_hpc_ace_double", "ReactionField", "None", "CubicSplineTable", "None", "Water4Water4", "", "Force" }
+};
+
+/* Number of entries in kernellist_sparc64_hpc_ace_double, derived from the array itself */
+int kernellist_sparc64_hpc_ace_double_size =
+    sizeof kernellist_sparc64_hpc_ace_double / sizeof *kernellist_sparc64_hpc_ace_double;
+
+#endif
diff --git a/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_sparc64_hpc_ace_double.h b/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_sparc64_hpc_ace_double.h
new file mode 100644 (file)
index 0000000..afb925b
--- /dev/null
@@ -0,0 +1,36 @@
+/*
+ * Note: this file was generated by the Gromacs c kernel generator.
+ *
+ *                This source code is part of
+ *
+ *                 G   R   O   M   A   C   S
+ *
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ *
+ * Gromacs is a library for molecular simulation and trajectory analysis,
+ * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
+ * a full list of developers and information, check out http://www.gromacs.org
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * To help fund GROMACS development, we humbly ask that you cite
+ * the papers people have written on it - you can find them on the website.
+ */
+#ifndef nb_kernel_sparc64_hpc_ace_double_h
+#define nb_kernel_sparc64_hpc_ace_double_h
+
+#include "../nb_kernel.h"
+
+
+/* List of kernels for this architecture with metadata about them */
+extern nb_kernel_info_t
+kernellist_sparc64_hpc_ace_double[];
+
+/* Number of entries in kernellist_sparc64_hpc_ace_double */
+extern int
+kernellist_sparc64_hpc_ace_double_size;
+
+#endif
diff --git a/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_template_sparc64_hpc_ace_double.pre b/src/gmxlib/nonbonded/nb_kernel_sparc64_hpc_ace_double/nb_kernel_template_sparc64_hpc_ace_double.pre
new file mode 100644 (file)
index 0000000..1349445
--- /dev/null
@@ -0,0 +1,1086 @@
+/* ## */
+/* ## This file is part of the GROMACS molecular simulation package. */
+/* ## */
+/* ## Copyright (c) 2012, by the GROMACS development team, led by */
+/* ## David van der Spoel, Berk Hess, Erik Lindahl, and including many */
+/* ## others, as listed in the AUTHORS file in the top-level source */
+/* ## directory and at http://www.gromacs.org. */
+/* ## */
+/* ## GROMACS is free software; you can redistribute it and/or */
+/* ## modify it under the terms of the GNU Lesser General Public License */
+/* ## as published by the Free Software Foundation; either version 2.1 */
+/* ## of the License, or (at your option) any later version. */
+/* ## */
+/* ## GROMACS is distributed in the hope that it will be useful, */
+/* ## but WITHOUT ANY WARRANTY; without even the implied warranty of */
+/* ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU */
+/* ## Lesser General Public License for more details. */
+/* ## */
+/* ## You should have received a copy of the GNU Lesser General Public */
+/* ## License along with GROMACS; if not, see */
+/* ## http://www.gnu.org/licenses, or write to the Free Software Foundation, */
+/* ## Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA. */
+/* ## */
+/* ## If you want to redistribute modifications to GROMACS, please */
+/* ## consider that scientific software is very special. Version */
+/* ## control is crucial - bugs must be traceable. We will be happy to */
+/* ## consider code for inclusion in the official distribution, but */
+/* ## derived work must not be called official GROMACS. Details are found */
+/* ## in the README & COPYING files - if they are missing, get the */
+/* ## official version at http://www.gromacs.org. */
+/* ## */
+/* ## To help us fund GROMACS development, we humbly ask that you cite */
+/* ## the research papers on the package. Check out http://www.gromacs.org. */
+/* ## */
+/* #if 0 */
+#error This file must be processed with the Gromacs pre-preprocessor
+/* #endif */
+/* #if INCLUDE_HEADER */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "../nb_kernel.h"
+#include "types/simple.h"
+#include "vec.h"
+#include "nrnb.h"
+
+#include "kernelutil_sparc64_hpc_ace_double.h"
+/* #endif */
+
+/* ## List of variables set by the generating script:                                    */
+/* ##                                                                                    */
+/* ## Settings that apply to the entire kernel:                                          */
+/* ## KERNEL_ELEC:           String, choice for electrostatic interactions               */
+/* ## KERNEL_VDW:            String, choice for van der Waals interactions               */
+/* ## KERNEL_NAME:           String, name of this kernel                                 */
+/* ## KERNEL_VF:             String telling if we calculate potential, force, or both    */
+/* ## GEOMETRY_I/GEOMETRY_J: String, name of each geometry, e.g. 'Water3' or '1Particle' */
+/* ##                                                                                    */
+/* ## Settings that apply to particles in the outer (I) or inner (J) loops:              */
+/* ## PARTICLES_I[]/         Arrays with lists of i/j particles to use in kernel. It is  */
+/* ## PARTICLES_J[]:         just [0] for particle geometry, but can be longer for water */
+/* ## PARTICLES_ELEC_I[]/    Arrays with lists of i/j particle that have electrostatics  */
+/* ## PARTICLES_ELEC_J[]:    interactions that should be calculated in this kernel.      */
+/* ## PARTICLES_VDW_I[]/     Arrays with the list of i/j particle that have VdW          */
+/* ## PARTICLES_VDW_J[]:     interactions that should be calculated in this kernel.      */
+/* ##                                                                                    */
+/* ## Settings for pairs of interactions (e.g. 2nd i particle against 1st j particle)    */
+/* ## PAIRS_IJ[]:            Array with (i,j) tuples of pairs for which interactions     */
+/* ##                        should be calculated in this kernel. Zero-charge particles  */
+/* ##                        do not have interactions with particles without vdw, and    */
+/* ##                        Vdw-only interactions are not evaluated in a no-vdw-kernel. */
+/* ## INTERACTION_FLAGS[][]: 2D matrix, dimension e.g. 3*3 for water-water interactions. */
+/* ##                        For each i-j pair, the element [I][J] is a list of strings  */
+/* ##                        defining properties/flags of this interaction. Examples     */
+/* ##                        include 'electrostatics'/'vdw' if that type of interaction  */
+/* ##                        should be evaluated, 'rsq'/'rinv'/'rinvsq' if those values  */
+/* ##                        are needed, and 'exactcutoff' or 'shift','switch' to        */
+/* ##                        decide if the force/potential should be modified. This way  */
+/* ##                        we only calculate values absolutely needed for each case.   */
+
+/* ## Calculate the size and offset for (merged/interleaved) table data */
+
+/*
+ * Gromacs nonbonded kernel:   {KERNEL_NAME}
+ * Electrostatics interaction: {KERNEL_ELEC}
+ * VdW interaction:            {KERNEL_VDW}
+ * Geometry:                   {GEOMETRY_I}-{GEOMETRY_J}
+ * Calculate force/pot:        {KERNEL_VF}
+ */
+void
+{KERNEL_NAME}
+                    (t_nblist * gmx_restrict                nlist,
+                     rvec * gmx_restrict                    xx,
+                     rvec * gmx_restrict                    ff,
+                     t_forcerec * gmx_restrict              fr,
+                     t_mdatoms * gmx_restrict               mdatoms,
+                     nb_kernel_data_t * gmx_restrict        kernel_data,
+                     t_nrnb * gmx_restrict                  nrnb)
+{
+    /* ## Not all variables are used for all kernels, but any optimizing compiler fixes that, */
+    /* ## so there is no point in going to extremes to exclude variables that are not needed. */
+    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
+     * just 0 for non-waters.
+     * Suffixes A,B refer to j loop unrolling done with double precision SIMD, e.g. for the two different
+     * jnr indices corresponding to data put in the two positions in the SIMD register.
+     */
+    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
+    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
+    int              jnrA,jnrB;
+    int              j_coord_offsetA,j_coord_offsetB;
+    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
+    real             rcutoff_scalar;
+    real             *shiftvec,*fshift,*x,*f;
+    _fjsp_v2r8       tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
+    /* #for I in PARTICLES_I */
+    int              vdwioffset{I};
+    _fjsp_v2r8       ix{I},iy{I},iz{I},fix{I},fiy{I},fiz{I},iq{I},isai{I};
+    /* #endfor */
+    /* #for J in PARTICLES_J */
+    int              vdwjidx{J}A,vdwjidx{J}B;
+    _fjsp_v2r8       jx{J},jy{J},jz{J},fjx{J},fjy{J},fjz{J},jq{J},isaj{J};
+    /* #endfor */
+    /* #for I,J in PAIRS_IJ */
+    _fjsp_v2r8       dx{I}{J},dy{I}{J},dz{I}{J},rsq{I}{J},rinv{I}{J},rinvsq{I}{J},r{I}{J},qq{I}{J},c6_{I}{J},c12_{I}{J};
+    /* #endfor */
+    /* #if KERNEL_ELEC != 'None' */
+    _fjsp_v2r8       velec,felec,velecsum,facel,crf,krf,krf2;
+    real             *charge;
+    /* #endif */
+    /* #if 'GeneralizedBorn' in KERNEL_ELEC */
+    _fjsp_v2r8       vgb,fgb,vgbsum,dvdasum,gbscale,gbtabscale,isaprod,gbqqfactor,gbinvepsdiff,dvdaj,gbeps,twogbeps,dvdatmp;
+    _fjsp_v2r8       minushalf = gmx_fjsp_set1_v2r8(-0.5);
+    real             *invsqrta,*dvda,*gbtab;
+    /* #endif */
+    /* #if KERNEL_VDW != 'None' */
+    int              nvdwtype;
+    _fjsp_v2r8       rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
+    int              *vdwtype;
+    real             *vdwparam;
+    _fjsp_v2r8       one_sixth   = gmx_fjsp_set1_v2r8(1.0/6.0);
+    _fjsp_v2r8       one_twelfth = gmx_fjsp_set1_v2r8(1.0/12.0);
+    /* #endif */
+    /* #if 'Table' in KERNEL_ELEC or 'GeneralizedBorn' in KERNEL_ELEC or 'Table' in KERNEL_VDW */
+    _fjsp_v2r8       rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
+    real             *vftab;
+    /* #endif */
+    /* #if 'Ewald' in KERNEL_ELEC */
+    _fjsp_v2r8       ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
+    real             *ewtab;
+    /* #endif */
+    /* #if 'PotentialSwitch' in [KERNEL_MOD_ELEC,KERNEL_MOD_VDW] */
+    _fjsp_v2r8       rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
+    real             rswitch_scalar,d_scalar;
+    /* #endif */
+    _fjsp_v2r8       itab_tmp;
+    _fjsp_v2r8       dummy_mask,cutoff_mask;
+    _fjsp_v2r8       one     = gmx_fjsp_set1_v2r8(1.0);
+    _fjsp_v2r8       two     = gmx_fjsp_set1_v2r8(2.0);
+    union { _fjsp_v2r8 simd; long long int i[2]; } vfconv,gbconv,ewconv;
+
+    x                = xx[0];
+    f                = ff[0];
+
+    nri              = nlist->nri;
+    iinr             = nlist->iinr;
+    jindex           = nlist->jindex;
+    jjnr             = nlist->jjnr;
+    shiftidx         = nlist->shift;
+    gid              = nlist->gid;
+    shiftvec         = fr->shift_vec[0];
+    fshift           = fr->fshift[0];
+    /* #if KERNEL_ELEC != 'None' */
+    facel            = gmx_fjsp_set1_v2r8(fr->epsfac);
+    charge           = mdatoms->chargeA;
+    /*     #if 'ReactionField' in KERNEL_ELEC */
+    krf              = gmx_fjsp_set1_v2r8(fr->ic->k_rf);
+    krf2             = gmx_fjsp_set1_v2r8(fr->ic->k_rf*2.0);
+    crf              = gmx_fjsp_set1_v2r8(fr->ic->c_rf);
+    /*     #endif */
+    /* #endif */
+    /* #if KERNEL_VDW != 'None' */
+    nvdwtype         = fr->ntype;
+    vdwparam         = fr->nbfp;
+    vdwtype          = mdatoms->typeA;
+    /* #endif */
+
+    /* #if 'Table' in KERNEL_ELEC and 'Table' in KERNEL_VDW */
+    vftab            = kernel_data->table_elec_vdw->data;
+    vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_elec_vdw->scale);
+    /* #elif 'Table' in KERNEL_ELEC */
+    vftab            = kernel_data->table_elec->data;
+    vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_elec->scale);
+    /* #elif 'Table' in KERNEL_VDW */
+    vftab            = kernel_data->table_vdw->data;
+    vftabscale       = gmx_fjsp_set1_v2r8(kernel_data->table_vdw->scale);
+    /* #endif */
+
+    /* #if 'Ewald' in KERNEL_ELEC */
+    sh_ewald         = gmx_fjsp_set1_v2r8(fr->ic->sh_ewald);
+    /*     #if KERNEL_VF=='Force' and KERNEL_MOD_ELEC!='PotentialSwitch' */
+    ewtab            = fr->ic->tabq_coul_F;
+    ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+    ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+    /*     #else */
+    ewtab            = fr->ic->tabq_coul_FDV0;
+    ewtabscale       = gmx_fjsp_set1_v2r8(fr->ic->tabq_scale);
+    ewtabhalfspace   = gmx_fjsp_set1_v2r8(0.5/fr->ic->tabq_scale);
+     /*     #endif */
+    /* #endif */
+
+    /* #if KERNEL_ELEC=='GeneralizedBorn' */
+    invsqrta         = fr->invsqrta;
+    dvda             = fr->dvda;
+    gbtabscale       = gmx_fjsp_set1_v2r8(fr->gbtab.scale);
+    gbtab            = fr->gbtab.data;
+    gbinvepsdiff     = gmx_fjsp_set1_v2r8((1.0/fr->epsilon_r) - (1.0/fr->gb_epsilon_solvent));
+    /* #endif */
+
+    /* #if 'Water' in GEOMETRY_I */
+    /* Setup water-specific parameters */
+    inr              = nlist->iinr[0];
+    /*     #for I in PARTICLES_ELEC_I */
+    iq{I}              = _fjsp_mul_v2r8(facel,gmx_fjsp_set1_v2r8(charge[inr+{I}]));
+    /*     #endfor */
+    /*     #for I in PARTICLES_VDW_I */
+    vdwioffset{I}      = 2*nvdwtype*vdwtype[inr+{I}];
+    /*     #endfor */
+    /* #endif */
+
+    /* #if 'Water' in GEOMETRY_J */
+    /*     #for J in PARTICLES_ELEC_J */
+    jq{J}              = gmx_fjsp_set1_v2r8(charge[inr+{J}]);
+    /*     #endfor */
+    /*     #for J in PARTICLES_VDW_J */
+    vdwjidx{J}A        = 2*vdwtype[inr+{J}];
+    /*     #endfor */
+    /*     #for I,J in PAIRS_IJ */
+    /*         #if 'electrostatics' in INTERACTION_FLAGS[I][J] */
+    qq{I}{J}             = _fjsp_mul_v2r8(iq{I},jq{J});
+    /*         #endif */
+    /*         #if 'vdw' in INTERACTION_FLAGS[I][J] */
+    c6_{I}{J}            = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset{I}+vdwjidx{J}A]);
+    c12_{I}{J}           = gmx_fjsp_set1_v2r8(vdwparam[vdwioffset{I}+vdwjidx{J}A+1]);
+    /*         #endif */
+    /*     #endfor */
+    /* #endif */
+
+    /* #if KERNEL_MOD_ELEC!='None' or KERNEL_MOD_VDW!='None' */
+    /*     #if KERNEL_ELEC!='None' */
+    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
+    rcutoff_scalar   = fr->rcoulomb;
+    /*     #else */
+    rcutoff_scalar   = fr->rvdw;
+    /*     #endif */
+    rcutoff          = gmx_fjsp_set1_v2r8(rcutoff_scalar);
+    rcutoff2         = _fjsp_mul_v2r8(rcutoff,rcutoff);
+    /* #endif */
+
+    /* #if KERNEL_MOD_VDW=='PotentialShift' */
+    sh_vdw_invrcut6  = gmx_fjsp_set1_v2r8(fr->ic->sh_invrc6);
+    rvdw             = gmx_fjsp_set1_v2r8(fr->rvdw);
+    /* #endif */
+
+    /* #if 'PotentialSwitch' in [KERNEL_MOD_ELEC,KERNEL_MOD_VDW] */
+    /*     #if KERNEL_MOD_ELEC=='PotentialSwitch'  */
+    rswitch_scalar   = fr->rcoulomb_switch;
+    rswitch          = gmx_fjsp_set1_v2r8(rswitch_scalar);
+    /*     #else */
+    rswitch_scalar   = fr->rvdw_switch;
+    rswitch          = gmx_fjsp_set1_v2r8(rswitch_scalar);
+    /*     #endif */
+    /* Setup switch parameters */
+    d_scalar         = rcutoff_scalar-rswitch_scalar;
+    d                = gmx_fjsp_set1_v2r8(d_scalar);
+    swV3             = gmx_fjsp_set1_v2r8(-10.0/(d_scalar*d_scalar*d_scalar));
+    swV4             = gmx_fjsp_set1_v2r8( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+    swV5             = gmx_fjsp_set1_v2r8( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+    /*     #if 'Force' in KERNEL_VF */
+    swF2             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar));
+    swF3             = gmx_fjsp_set1_v2r8( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
+    swF4             = gmx_fjsp_set1_v2r8(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
+    /*     #endif */
+    /* #endif */
+
+    /* Avoid stupid compiler warnings */
+    jnrA = jnrB = 0;
+    j_coord_offsetA = 0;
+    j_coord_offsetB = 0;
+
+    /* ## Keep track of the floating point operations we issue for reporting! */
+    /* #define OUTERFLOPS 0 */
+    outeriter        = 0;
+    inneriter        = 0;
+
+    /* Start outer loop over neighborlists */
+    for(iidx=0; iidx<nri; iidx++)
+    {
+        /* Load shift vector for this list */
+        i_shift_offset   = DIM*shiftidx[iidx];
+
+        /* Load limits for loop over neighbors */
+        j_index_start    = jindex[iidx];
+        j_index_end      = jindex[iidx+1];
+
+        /* Get outer coordinate index */
+        inr              = iinr[iidx];
+        i_coord_offset   = DIM*inr;
+
+        /* Load i particle coords and add shift vector */
+        /* #if GEOMETRY_I == 'Particle' */
+        gmx_fjsp_load_shift_and_1rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+        /* #elif GEOMETRY_I == 'Water3' */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+        /* #elif GEOMETRY_I == 'Water4' */
+        /*     #if 0 in PARTICLES_I                 */
+        gmx_fjsp_load_shift_and_4rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+        /*     #else                                */
+        gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
+                                                 &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+        /*     #endif                               */
+        /* #endif                                   */
+
+        /* #if 'Force' in KERNEL_VF */
+        /*     #for I in PARTICLES_I */
+        fix{I}             = _fjsp_setzero_v2r8();
+        fiy{I}             = _fjsp_setzero_v2r8();
+        fiz{I}             = _fjsp_setzero_v2r8();
+        /*     #endfor */
+        /* #endif */
+
+        /* ## For water we already preloaded parameters at the start of the kernel */
+        /* #if not 'Water' in GEOMETRY_I */
+        /* Load parameters for i particles */
+        /*     #for I in PARTICLES_ELEC_I */
+        iq{I}              = _fjsp_mul_v2r8(facel,gmx_fjsp_load1_v2r8(charge+inr+{I}));
+        /*         #define OUTERFLOPS OUTERFLOPS+1 */
+        /*         #if KERNEL_ELEC=='GeneralizedBorn' */
+        isai{I}            = gmx_fjsp_load1_v2r8(invsqrta+inr+{I});
+        /*         #endif */
+        /*     #endfor */
+        /*     #for I in PARTICLES_VDW_I */
+        vdwioffset{I}      = 2*nvdwtype*vdwtype[inr+{I}];
+        /*     #endfor */
+        /* #endif */
+
+        /* #if 'Potential' in KERNEL_VF */
+        /* Reset potential sums */
+        /*     #if KERNEL_ELEC != 'None' */
+        velecsum         = _fjsp_setzero_v2r8();
+        /*     #endif */
+        /*     #if 'GeneralizedBorn' in KERNEL_ELEC */
+        vgbsum           = _fjsp_setzero_v2r8();
+        /*     #endif */
+        /*     #if KERNEL_VDW != 'None' */
+        vvdwsum          = _fjsp_setzero_v2r8();
+        /*     #endif */
+        /* #endif */
+        /*     #if 'GeneralizedBorn' in KERNEL_ELEC and 'Force' in KERNEL_VF */
+        dvdasum          = _fjsp_setzero_v2r8();
+        /*     #endif */
+
+        /* #for ROUND in ['Loop','Epilogue'] */
+
+        /* #if ROUND =='Loop' */
+        /* Start inner kernel loop */
+        for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
+        {
+        /* ## First round is normal loop (next statement resets indentation) */
+        /*     #if 0 */
+        }
+        /*     #endif */
+        /* #else */
+        if(jidx<j_index_end)
+        {
+        /* ## Second round is epilogue */
+        /* #endif */
+        /* #define INNERFLOPS 0 */
+
+            /* #if ROUND =='Loop' */
+            /* Get j neighbor index, and coordinate index */
+            jnrA             = jjnr[jidx];
+            jnrB             = jjnr[jidx+1];
+            j_coord_offsetA  = DIM*jnrA;
+            j_coord_offsetB  = DIM*jnrB;
+
+            /* load j atom coordinates */
+            /*     #if GEOMETRY_J == 'Particle'             */
+            gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0);
+            /*     #elif GEOMETRY_J == 'Water3'             */
+            gmx_fjsp_load_3rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+            /*     #elif GEOMETRY_J == 'Water4'             */
+            /*         #if 0 in PARTICLES_J                 */
+            gmx_fjsp_load_4rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA,x+j_coord_offsetB,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+            /*         #else                                */
+            gmx_fjsp_load_3rvec_2ptr_swizzle_v2r8(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
+                                              &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
+            /*         #endif                               */
+            /*     #endif                                   */
+            /* #else */
+            jnrA             = jjnr[jidx];
+            j_coord_offsetA  = DIM*jnrA;
+
+            /* load j atom coordinates */
+            /*     #if GEOMETRY_J == 'Particle'             */
+            gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0);
+            /*     #elif GEOMETRY_J == 'Water3'             */
+            gmx_fjsp_load_3rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
+            /*     #elif GEOMETRY_J == 'Water4'             */
+            /*         #if 0 in PARTICLES_J                 */
+            gmx_fjsp_load_4rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA,
+                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
+                                              &jy2,&jz2,&jx3,&jy3,&jz3);
+            /*         #else                                */
+            gmx_fjsp_load_3rvec_1ptr_swizzle_v2r8(x+j_coord_offsetA+DIM,
+                                              &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
+            /*         #endif                               */
+            /*     #endif                                   */
+            /* #endif */
+
+            /* Calculate displacement vector */
+            /* #for I,J in PAIRS_IJ */
+            dx{I}{J}             = _fjsp_sub_v2r8(ix{I},jx{J});
+            dy{I}{J}             = _fjsp_sub_v2r8(iy{I},jy{J});
+            dz{I}{J}             = _fjsp_sub_v2r8(iz{I},jz{J});
+            /*     #define INNERFLOPS INNERFLOPS+3 */
+            /* #endfor */
+
+            /* Calculate squared distance and things based on it */
+            /* #for I,J in PAIRS_IJ */
+            rsq{I}{J}            = gmx_fjsp_calc_rsq_v2r8(dx{I}{J},dy{I}{J},dz{I}{J});
+            /*     #define INNERFLOPS INNERFLOPS+5 */
+            /* #endfor */
+
+            /* #for I,J in PAIRS_IJ */
+            /*     #if 'rinv' in INTERACTION_FLAGS[I][J] */
+            rinv{I}{J}           = gmx_fjsp_invsqrt_v2r8(rsq{I}{J});
+            /*         #define INNERFLOPS INNERFLOPS+5 */
+            /*     #endif */
+            /* #endfor */
+
+            /* #for I,J in PAIRS_IJ */
+            /*     #if 'rinvsq' in INTERACTION_FLAGS[I][J] */
+            /*         # if 'rinv' not in INTERACTION_FLAGS[I][J] */
+            rinvsq{I}{J}         = gmx_fjsp_inv_v2r8(rsq{I}{J});
+            /*             #define INNERFLOPS INNERFLOPS+4 */
+            /*         #else */
+            rinvsq{I}{J}         = _fjsp_mul_v2r8(rinv{I}{J},rinv{I}{J});
+            /*             #define INNERFLOPS INNERFLOPS+1 */
+            /*         #endif */
+            /*     #endif */
+            /* #endfor */
+
+            /* #if not 'Water' in GEOMETRY_J */
+            /* Load parameters for j particles */
+            /*     #for J in PARTICLES_ELEC_J */
+            /*         #if ROUND =='Loop' */
+            jq{J}              = gmx_fjsp_load_2real_swizzle_v2r8(charge+jnrA+{J},charge+jnrB+{J});
+            /*         #else */
+            jq{J}              = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),charge+jnrA+{J});
+            /*         #endif */
+            /*         #if KERNEL_ELEC=='GeneralizedBorn' */
+            /*             #if ROUND =='Loop' */
+            isaj{J}            = gmx_fjsp_load_2real_swizzle_v2r8(invsqrta+jnrA+{J},invsqrta+jnrB+{J});
+            /*             #else */
+            isaj{J}            = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),invsqrta+jnrA+{J});
+            /*             #endif */
+            /*         #endif */
+            /*     #endfor */
+            /*     #for J in PARTICLES_VDW_J */
+            vdwjidx{J}A        = 2*vdwtype[jnrA+{J}];
+            /*         #if ROUND =='Loop' */
+            vdwjidx{J}B        = 2*vdwtype[jnrB+{J}];
+            /*         #endif */
+            /*     #endfor */
+            /* #endif */
+
+            /* #if 'Force' in KERNEL_VF and not 'Particle' in GEOMETRY_I */
+            /*     #for J in PARTICLES_J */
+            fjx{J}             = _fjsp_setzero_v2r8();
+            fjy{J}             = _fjsp_setzero_v2r8();
+            fjz{J}             = _fjsp_setzero_v2r8();
+            /*     #endfor */
+            /* #endif */
+
+            /* #for I,J in PAIRS_IJ */
+
+            /**************************
+             * CALCULATE INTERACTIONS *
+             **************************/
+
+            /*     #if 'exactcutoff' in INTERACTION_FLAGS[I][J] */
+            /*         ## We always calculate rinv/rinvsq above to enable pipelining in compilers (performance tested on x86) */
+            if (gmx_fjsp_any_lt_v2r8(rsq{I}{J},rcutoff2))
+            {
+                /*     #if 0    ## this and the next two lines are a hack to maintain auto-indentation in template file */
+            }
+            /*         #endif */
+            /*         #define INNERFLOPS INNERFLOPS+1 */
+            /*     #endif */
+
+            /*     #if 'r' in INTERACTION_FLAGS[I][J] */
+            r{I}{J}              = _fjsp_mul_v2r8(rsq{I}{J},rinv{I}{J});
+             /*         #define INNERFLOPS INNERFLOPS+1 */
+            /*     #endif */
+
+            /*     ## For water geometries we already loaded parameters at the start of the kernel */
+            /*     #if not 'Water' in GEOMETRY_J */
+            /* Compute parameters for interactions between i and j atoms */
+            /*         #if 'electrostatics' in INTERACTION_FLAGS[I][J] */
+            qq{I}{J}             = _fjsp_mul_v2r8(iq{I},jq{J});
+            /*             #define INNERFLOPS INNERFLOPS+1 */
+            /*         #endif */
+            /*         #if 'vdw' in INTERACTION_FLAGS[I][J] */
+            /*             #if ROUND == 'Loop' */
+            gmx_fjsp_load_2pair_swizzle_v2r8(vdwparam+vdwioffset{I}+vdwjidx{J}A,
+                                         vdwparam+vdwioffset{I}+vdwjidx{J}B,&c6_{I}{J},&c12_{I}{J});
+            /*             #else */
+            gmx_fjsp_load_1pair_swizzle_v2r8(vdwparam+vdwioffset{I}+vdwjidx{J}A,&c6_{I}{J},&c12_{I}{J});
+            /*             #endif */
+            /*         #endif */
+            /*     #endif */
+
+            /*     #if 'table' in INTERACTION_FLAGS[I][J] */
+            /* Calculate table index by multiplying r with table scale and truncate to integer */
+            rt               = _fjsp_mul_v2r8(r{I}{J},vftabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            vfeps            = _fjsp_sub_v2r8(rt, _fjsp_xtod_v2r8(itab_tmp));
+            twovfeps         = _fjsp_add_v2r8(vfeps,vfeps);
+            _fjsp_store_v2r8(&vfconv.simd,itab_tmp);
+
+            /*         #define INNERFLOPS INNERFLOPS+4                          */
+            /*         #if 'Table' in KERNEL_ELEC and 'Table' in KERNEL_VDW     */
+            /*             ## 3 tables, 4 data per point: multiply index by 12 */
+            vfconv.i[0]     *= 12;
+            vfconv.i[1]     *= 12;
+            /*         #elif 'Table' in KERNEL_ELEC                             */
+            /*             ## 1 table, 4 data per point: multiply index by 4   */
+            vfconv.i[0]     *= 4;
+            vfconv.i[1]     *= 4;
+            /*         #elif 'Table' in KERNEL_VDW                              */
+            /*             ## 2 tables, 4 data per point: multiply index by 8  */
+            vfconv.i[0]     *= 8;
+            vfconv.i[1]     *= 8;
+            /*         #endif                                                   */
+            /*     #endif */
+
+            /*     ## ELECTROSTATIC INTERACTIONS */
+            /*     #if 'electrostatics' in INTERACTION_FLAGS[I][J] */
+
+            /*         #if KERNEL_ELEC=='Coulomb' */
+
+            /* COULOMB ELECTROSTATICS */
+            velec            = _fjsp_mul_v2r8(qq{I}{J},rinv{I}{J});
+            /*             #define INNERFLOPS INNERFLOPS+1 */
+            /*             #if 'Force' in KERNEL_VF */
+            felec            = _fjsp_mul_v2r8(velec,rinvsq{I}{J});
+            /*                 #define INNERFLOPS INNERFLOPS+2 */
+            /*             #endif */
+
+            /*         #elif KERNEL_ELEC=='ReactionField' */
+
+            /* REACTION-FIELD ELECTROSTATICS */
+            /*             #if 'Potential' in KERNEL_VF */
+            velec            = _fjsp_mul_v2r8(qq{I}{J},_fjsp_sub_v2r8(_fjsp_madd_v2r8(krf,rsq{I}{J},rinv{I}{J}),crf));
+            /*                 #define INNERFLOPS INNERFLOPS+4 */
+            /*             #endif */
+            /*             #if 'Force' in KERNEL_VF */
+            felec            = _fjsp_mul_v2r8(qq{I}{J},_fjsp_msub_v2r8(rinv{I}{J},rinvsq{I}{J},krf2));
+            /*                 #define INNERFLOPS INNERFLOPS+3 */
+            /*             #endif */
+
+            /*         #elif KERNEL_ELEC=='GeneralizedBorn' */
+
+            /* GENERALIZED BORN AND COULOMB ELECTROSTATICS */
+            isaprod          = _fjsp_mul_v2r8(isai{I},isaj{J});
+            gbqqfactor       = _fjsp_neg_v2r8(_fjsp_mul_v2r8(qq{I}{J},_fjsp_mul_v2r8(isaprod,gbinvepsdiff)));
+            gbscale          = _fjsp_mul_v2r8(isaprod,gbtabscale);
+            /*             #define INNERFLOPS INNERFLOPS+5 */
+
+            /* Calculate generalized born table index - this is a separate table from the normal one,
+             * but we use the same procedure by multiplying r with scale and truncating to integer.
+             */
+            rt               = _fjsp_mul_v2r8(r{I}{J},gbscale);
+            itab_tmp         = _fjsp_dtox_v2r8(rt);
+            gbeps            = _fjsp_sub_v2r8(rt,_fjsp_xtod_v2r8(itab_tmp));
+            _fjsp_store_v2r8(&gbconv.simd,itab_tmp);
+
+            Y                = _fjsp_load_v2r8( gbtab + 4*gbconv.i[0] );
+            /*             #if ROUND == 'Loop' */
+            F                = _fjsp_load_v2r8( gbtab + 4*gbconv.i[1] );
+            /*             #else */
+            F                = _fjsp_setzero_v2r8();
+            /*             #endif */
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( gbtab + 4*gbconv.i[0] +2);
+            /*             #if ROUND == 'Loop' */
+            H                = _fjsp_load_v2r8( gbtab + 4*gbconv.i[1] +2);
+            /*             #else */
+            H                = _fjsp_setzero_v2r8();
+            /*             #endif */
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(gbeps,_fjsp_madd_v2r8(gbeps,H,G),F);
+            VV               = _fjsp_madd_v2r8(gbeps,Fp,Y);
+            vgb              = _fjsp_mul_v2r8(gbqqfactor,VV);
+            /*             #define INNERFLOPS INNERFLOPS+10 */
+
+            /*             #if 'Force' in KERNEL_VF */
+            twogbeps         = _fjsp_add_v2r8(gbeps,gbeps);
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twogbeps,H,G),gbeps,Fp);
+            fgb              = _fjsp_mul_v2r8(gbqqfactor,_fjsp_mul_v2r8(FF,gbscale));
+            dvdatmp          = _fjsp_mul_v2r8(minushalf,_fjsp_madd_v2r8(fgb,r{I}{J},vgb));
+            dvdasum          = _fjsp_add_v2r8(dvdasum,dvdatmp);
+            /*             #if ROUND == 'Loop' */
+            gmx_fjsp_increment_2real_swizzle_v2r8(dvda+jnrA,dvda+jnrB,_fjsp_mul_v2r8(dvdatmp,_fjsp_mul_v2r8(isaj{J},isaj{J})));
+            /*             #else */
+            gmx_fjsp_increment_1real_v2r8(dvda+jnrA,_fjsp_mul_v2r8(dvdatmp,_fjsp_mul_v2r8(isaj{J},isaj{J})));
+            /*             #endif */
+            /*                 #define INNERFLOPS INNERFLOPS+13 */
+            /*             #endif */
+            velec            = _fjsp_mul_v2r8(qq{I}{J},rinv{I}{J});
+            /*                 #define INNERFLOPS INNERFLOPS+1 */
+            /*             #if 'Force' in KERNEL_VF */
+            felec            = _fjsp_mul_v2r8(_fjsp_msub_v2r8(velec,rinv{I}{J},fgb),rinv{I}{J});
+            /*                 #define INNERFLOPS INNERFLOPS+3 */
+            /*             #endif */
+
+            /*         #elif KERNEL_ELEC=='Ewald' */
+            /* EWALD ELECTROSTATICS */
+
+            /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
+            ewrt             = _fjsp_mul_v2r8(r{I}{J},ewtabscale);
+            itab_tmp         = _fjsp_dtox_v2r8(ewrt);
+            eweps            = _fjsp_sub_v2r8(ewrt,_fjsp_xtod_v2r8(itab_tmp));
+           _fjsp_store_v2r8(&ewconv.simd,itab_tmp);
+
+            /*             #define INNERFLOPS INNERFLOPS+4 */
+            /*             #if 'Potential' in KERNEL_VF or KERNEL_MOD_ELEC=='PotentialSwitch' */
+            ewtabF           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[0] );
+            /*                 #if ROUND == 'Loop' */
+            ewtabD           = _fjsp_load_v2r8( ewtab + 4*ewconv.i[1] );
+            /*                 #else */
+            ewtabD           = _fjsp_setzero_v2r8();
+            /*                 #endif */
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabF,ewtabD);
+            ewtabV           = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[0] +2);
+            /*                 #if ROUND == 'Loop' */
+            ewtabFn          = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ewtab + 4*ewconv.i[1] +2);
+            /*                 #else */
+            ewtabFn          = _fjsp_setzero_v2r8();
+            /*                 #endif */
+            GMX_FJSP_TRANSPOSE2_V2R8(ewtabV,ewtabFn);
+            felec            = _fjsp_madd_v2r8(eweps,ewtabD,ewtabF);
+            /*                 #define INNERFLOPS INNERFLOPS+2 */
+            /*                 #if KERNEL_MOD_ELEC=='PotentialShift' */            
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq{I}{J},_fjsp_sub_v2r8(_fjsp_sub_v2r8(rinv{I}{J},sh_ewald),velec));
+            /*                     #define INNERFLOPS INNERFLOPS+7 */
+            /*                 #else */
+            velec            = _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(ewtabhalfspace,eweps) ,_fjsp_add_v2r8(ewtabF,felec), ewtabV);
+            velec            = _fjsp_mul_v2r8(qq{I}{J},_fjsp_sub_v2r8(rinv{I}{J},velec));
+            /*                     #define INNERFLOPS INNERFLOPS+6 */
+            /*                 #endif */
+            /*                 #if 'Force' in KERNEL_VF */
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq{I}{J},rinv{I}{J}),_fjsp_sub_v2r8(rinvsq{I}{J},felec));
+            /*                      #define INNERFLOPS INNERFLOPS+3 */
+            /*                 #endif */
+            /*             #elif KERNEL_VF=='Force' */
+            /*                 #if ROUND == 'Loop' */
+            gmx_fjsp_load_2pair_swizzle_v2r8(ewtab+ewconv.i[0],ewtab+ewconv.i[1],
+                                         &ewtabF,&ewtabFn);
+            /*                 #else */
+            gmx_fjsp_load_1pair_swizzle_v2r8(ewtab+ewconv.i[0],&ewtabF,&ewtabFn);
+            /*                 #endif */
+            felec            = _fjsp_madd_v2r8(eweps,ewtabFn,_fjsp_nmsub_v2r8(eweps,ewtabF,ewtabF));
+            felec            = _fjsp_mul_v2r8(_fjsp_mul_v2r8(qq{I}{J},rinv{I}{J}),_fjsp_sub_v2r8(rinvsq{I}{J},felec));
+            /*                 #define INNERFLOPS INNERFLOPS+7 */
+            /*             #endif */
+
+            /*         #elif KERNEL_ELEC=='CubicSplineTable' */
+
+            /* CUBIC SPLINE TABLE ELECTROSTATICS */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            /*             #if ROUND == 'Loop' */
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            /*             #else */
+            F                = _fjsp_setzero_v2r8();
+            /*             #endif */
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] +2);
+            /*             #if ROUND == 'Loop' */
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] +2);
+            /*             #else */
+            H                = _fjsp_setzero_v2r8();
+            /*             #endif */
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(vfeps,H,G),F);
+            /*             #define INNERFLOPS INNERFLOPS+4 */
+            /*             #if 'Potential' in KERNEL_VF */
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            velec            = _fjsp_mul_v2r8(qq{I}{J},VV);
+            /*                 #define INNERFLOPS INNERFLOPS+3 */
+            /*             #endif */
+            /*             #if 'Force' in KERNEL_VF */
+            FF               = _fjsp_madd_v2r8(_fjsp_madd_v2r8(twovfeps,H,G),vfeps,Fp);
+            felec            = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_mul_v2r8(qq{I}{J},FF),_fjsp_mul_v2r8(vftabscale,rinv{I}{J})));
+            /*                 #define INNERFLOPS INNERFLOPS+7 */
+            /*             #endif */
+            /*         #endif */
+            /*         ## End of check for electrostatics interaction forms */
+            /*     #endif */
+            /*     ## END OF ELECTROSTATIC INTERACTION CHECK FOR PAIR I-J */
+
+            /*     #if 'vdw' in INTERACTION_FLAGS[I][J] */
+
+            /*         #if KERNEL_VDW=='LennardJones' */
+
+            /* LENNARD-JONES DISPERSION/REPULSION */
+
+            rinvsix          = _fjsp_mul_v2r8(_fjsp_mul_v2r8(rinvsq{I}{J},rinvsq{I}{J}),rinvsq{I}{J});
+            /*             #define INNERFLOPS INNERFLOPS+2 */
+            /*             #if 'Potential' in KERNEL_VF or KERNEL_MOD_VDW=='PotentialSwitch' */
+            vvdw6            = _fjsp_mul_v2r8(c6_{I}{J},rinvsix);
+            vvdw12           = _fjsp_mul_v2r8(c12_{I}{J},_fjsp_mul_v2r8(rinvsix,rinvsix));
+            /*                 #define INNERFLOPS INNERFLOPS+3 */
+            /*                 #if KERNEL_MOD_VDW=='PotentialShift' */
+            vvdw             = _fjsp_msub_v2r8(_fjsp_nmsub_v2r8(c12_{I}{J},_fjsp_mul_v2r8(sh_vdw_invrcut6,sh_vdw_invrcut6),vvdw12),one_twelfth,
+                                           _fjsp_mul_v2r8(_fjsp_nmsub_v2r8( c6_{I}{J},sh_vdw_invrcut6,vvdw6),one_sixth));
+            /*                     #define INNERFLOPS INNERFLOPS+8 */
+            /*                 #else */
+            vvdw             = _fjsp_msub_v2r8( vvdw12,one_twelfth, _fjsp_mul_v2r8(vvdw6,one_sixth) );
+            /*                     #define INNERFLOPS INNERFLOPS+3 */
+            /*                 #endif */
+            /*                 ## Check for force inside potential check, i.e. this means we already did the potential part */
+            /*                 #if 'Force' in KERNEL_VF */
+            fvdw             = _fjsp_mul_v2r8(_fjsp_sub_v2r8(vvdw12,vvdw6),rinvsq{I}{J});
+            /*                     #define INNERFLOPS INNERFLOPS+2 */
+            /*                 #endif */
+            /*             #elif KERNEL_VF=='Force' */
+            /*                 ## Force-only LennardJones makes it possible to save 1 flop (they do add up...) */
+            fvdw             = _fjsp_mul_v2r8(_fjsp_msub_v2r8(c12_{I}{J},rinvsix,c6_{I}{J}),_fjsp_mul_v2r8(rinvsix,rinvsq{I}{J}));
+            /*                 #define INNERFLOPS INNERFLOPS+4 */
+            /*             #endif */
+
+            /*         #elif KERNEL_VDW=='CubicSplineTable' */
+
+            /* CUBIC SPLINE TABLE DISPERSION */
+            /*             #if 'Table' in KERNEL_ELEC */
+            vfconv.i[0]       += 4;
+            vfconv.i[1]       += 4;
+            /*             #endif                     */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] );
+            /*             #if ROUND == 'Loop' */
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] );
+            /*             #else */
+            F                = _fjsp_setzero_v2r8();
+            /*             #endif */
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 2 );
+            /*             #if ROUND == 'Loop' */
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 2 );
+            /*             #else */
+            H                = _fjsp_setzero_v2r8();
+            /*             #endif */
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            /*             #define INNERFLOPS INNERFLOPS+4 */
+            /*             #if 'Potential' in KERNEL_VF */
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw6            = _fjsp_mul_v2r8(c6_{I}{J},VV);
+            /*                 #define INNERFLOPS INNERFLOPS+3 */
+            /*             #endif */
+            /*             #if 'Force' in KERNEL_VF */
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw6            = _fjsp_mul_v2r8(c6_{I}{J},FF);
+            /*                 #define INNERFLOPS INNERFLOPS+4 */
+            /*             #endif */
+
+            /* CUBIC SPLINE TABLE REPULSION */
+            Y                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 4 );
+            /*             #if ROUND == 'Loop' */
+            F                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 4 );
+            /*             #else */
+            F                = _fjsp_setzero_v2r8();
+            /*             #endif */
+            GMX_FJSP_TRANSPOSE2_V2R8(Y,F);
+            G                = _fjsp_load_v2r8( vftab + vfconv.i[0] + 6 );
+            /*             #if ROUND == 'Loop' */
+            H                = _fjsp_load_v2r8( vftab + vfconv.i[1] + 6 );
+            /*             #else */
+            H                = _fjsp_setzero_v2r8();
+            /*             #endif */
+            GMX_FJSP_TRANSPOSE2_V2R8(G,H);
+            Fp               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(H,vfeps,G),F);
+            /*             #define INNERFLOPS INNERFLOPS+4 */
+            /*             #if 'Potential' in KERNEL_VF */
+            VV               = _fjsp_madd_v2r8(vfeps,Fp,Y);
+            vvdw12           = _fjsp_mul_v2r8(c12_{I}{J},VV);
+            /*                 #define INNERFLOPS INNERFLOPS+3 */
+            /*             #endif */
+            /*             #if 'Force' in KERNEL_VF */
+            FF               = _fjsp_madd_v2r8(vfeps,_fjsp_madd_v2r8(twovfeps,H,G),Fp);
+            fvdw12           = _fjsp_mul_v2r8(c12_{I}{J},FF);
+            /*                 #define INNERFLOPS INNERFLOPS+5 */
+            /*             #endif */
+            /*             #if 'Potential' in KERNEL_VF */
+            vvdw             = _fjsp_add_v2r8(vvdw12,vvdw6);
+            /*                 #define INNERFLOPS INNERFLOPS+1 */
+            /*             #endif */
+            /*             #if 'Force' in KERNEL_VF */
+            fvdw             = _fjsp_neg_v2r8(_fjsp_mul_v2r8(_fjsp_add_v2r8(fvdw6,fvdw12),_fjsp_mul_v2r8(vftabscale,rinv{I}{J})));
+            /*                 #define INNERFLOPS INNERFLOPS+4 */
+            /*             #endif */
+            /*         #endif */
+            /*         ## End of check for vdw interaction forms */
+            /*     #endif */
+            /*     ## END OF VDW INTERACTION CHECK FOR PAIR I-J */
+
+            /*     #if 'switch' in INTERACTION_FLAGS[I][J] */
+            d                = _fjsp_sub_v2r8(r{I}{J},rswitch);
+            d                = _fjsp_max_v2r8(d,_fjsp_setzero_v2r8());
+            d2               = _fjsp_mul_v2r8(d,d);
+            sw               = _fjsp_add_v2r8(one,_fjsp_mul_v2r8(d2,_fjsp_mul_v2r8(d,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swV5,swV4),swV3))));
+            /*         #define INNERFLOPS INNERFLOPS+10 */
+
+            /*         #if 'Force' in KERNEL_VF */
+            dsw              = _fjsp_mul_v2r8(d2,_fjsp_madd_v2r8(d,_fjsp_madd_v2r8(d,swF4,swF3),swF2));
+            /*             #define INNERFLOPS INNERFLOPS+5 */
+            /*         #endif */
+
+            /* Evaluate switch function */
+            /*         #if 'Force' in KERNEL_VF */
+            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
+            /*             #if 'electrostatics' in INTERACTION_FLAGS[I][J] and KERNEL_MOD_ELEC=='PotentialSwitch' */
+            felec            = _fjsp_msub_v2r8( felec,sw , _fjsp_mul_v2r8(rinv{I}{J},_fjsp_mul_v2r8(velec,dsw)) );
+            /*                 #define INNERFLOPS INNERFLOPS+4 */
+            /*             #endif */
+            /*             #if 'vdw' in INTERACTION_FLAGS[I][J] and KERNEL_MOD_VDW=='PotentialSwitch' */
+            fvdw             = _fjsp_msub_v2r8( fvdw,sw , _fjsp_mul_v2r8(rinv{I}{J},_fjsp_mul_v2r8(vvdw,dsw)) );
+            /*                 #define INNERFLOPS INNERFLOPS+4 */
+            /*             #endif */
+            /*         #endif */
+            /*         #if 'Potential' in KERNEL_VF */
+            /*             #if 'electrostatics' in INTERACTION_FLAGS[I][J] and KERNEL_MOD_ELEC=='PotentialSwitch' */
+            velec            = _fjsp_mul_v2r8(velec,sw);
+            /*                 #define INNERFLOPS INNERFLOPS+1 */
+            /*             #endif */
+            /*             #if 'vdw' in INTERACTION_FLAGS[I][J] and KERNEL_MOD_VDW=='PotentialSwitch' */
+            vvdw             = _fjsp_mul_v2r8(vvdw,sw);
+            /*                 #define INNERFLOPS INNERFLOPS+1 */
+            /*             #endif */
+            /*         #endif */
+            /*     #endif */
+            /*     #if 'exactcutoff' in INTERACTION_FLAGS[I][J] */
+            cutoff_mask      = _fjsp_cmplt_v2r8(rsq{I}{J},rcutoff2);
+            /*         #define INNERFLOPS INNERFLOPS+1 */
+            /*     #endif */
+
+            /*     #if 'Potential' in KERNEL_VF */
+            /* Update potential sum for this i atom from the interaction with this j atom. */
+            /*         #if 'electrostatics' in INTERACTION_FLAGS[I][J] */
+            /*             #if 'exactcutoff' in INTERACTION_FLAGS[I][J] */
+            velec            = _fjsp_and_v2r8(velec,cutoff_mask);
+            /*                 #define INNERFLOPS INNERFLOPS+1 */
+            /*             #endif                                       */
+            /*             #if ROUND == 'Epilogue' */
+            velec            = _fjsp_unpacklo_v2r8(velec,_fjsp_setzero_v2r8());
+            /*             #endif */
+            velecsum         = _fjsp_add_v2r8(velecsum,velec);
+            /*             #define INNERFLOPS INNERFLOPS+1 */
+            /*             #if KERNEL_ELEC=='GeneralizedBorn' */
+            /*             #if 'exactcutoff' in INTERACTION_FLAGS[I][J] */
+            vgb              = _fjsp_and_v2r8(vgb,cutoff_mask);
+            /*                 #define INNERFLOPS INNERFLOPS+1 */
+            /*             #endif                                       */
+            /*             #if ROUND == 'Epilogue' */
+            vgb              = _fjsp_unpacklo_v2r8(vgb,_fjsp_setzero_v2r8());
+            /*             #endif */
+            vgbsum           = _fjsp_add_v2r8(vgbsum,vgb);
+            /*                 #define INNERFLOPS INNERFLOPS+1 */
+            /*             #endif */
+            /*         #endif */
+            /*         #if 'vdw' in INTERACTION_FLAGS[I][J] */
+            /*             #if 'exactcutoff' in INTERACTION_FLAGS[I][J] */
+            vvdw             = _fjsp_and_v2r8(vvdw,cutoff_mask);
+            /*                 #define INNERFLOPS INNERFLOPS+1 */
+            /*             #endif                                       */
+            /*             #if ROUND == 'Epilogue' */
+            vvdw             = _fjsp_unpacklo_v2r8(vvdw,_fjsp_setzero_v2r8());
+            /*             #endif */
+            vvdwsum          = _fjsp_add_v2r8(vvdwsum,vvdw);
+            /*             #define INNERFLOPS INNERFLOPS+1 */
+            /*         #endif */
+            /*     #endif */
+
+            /*     #if 'Force' in KERNEL_VF */
+
+            /*         #if 'electrostatics' in INTERACTION_FLAGS[I][J] and 'vdw' in INTERACTION_FLAGS[I][J] */
+            fscal            = _fjsp_add_v2r8(felec,fvdw);
+            /*             #define INNERFLOPS INNERFLOPS+1 */
+            /*         #elif 'electrostatics' in INTERACTION_FLAGS[I][J] */
+            fscal            = felec;
+            /*         #elif 'vdw' in INTERACTION_FLAGS[I][J] */
+            fscal            = fvdw;
+            /*        #endif */
+
+            /*             #if 'exactcutoff' in INTERACTION_FLAGS[I][J] */
+            fscal            = _fjsp_and_v2r8(fscal,cutoff_mask);
+            /*                 #define INNERFLOPS INNERFLOPS+1 */
+            /*             #endif                                       */
+
+            /*             #if ROUND == 'Epilogue' */
+            fscal            = _fjsp_unpacklo_v2r8(fscal,_fjsp_setzero_v2r8());
+            /*             #endif */
+
+            /* ## Construction of vectorial force built into FMA instructions now */
+            /* #define INNERFLOPS INNERFLOPS+3      */
+            
+            /* Update vectorial force */
+            fix{I}             = _fjsp_madd_v2r8(dx{I}{J},fscal,fix{I});
+            fiy{I}             = _fjsp_madd_v2r8(dy{I}{J},fscal,fiy{I});
+            fiz{I}             = _fjsp_madd_v2r8(dz{I}{J},fscal,fiz{I});
+            /*             #define INNERFLOPS INNERFLOPS+6 */
+            
+            /* #if GEOMETRY_I == 'Particle'             */
+            /*     #if ROUND == 'Loop' */
+            gmx_fjsp_decrement_fma_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fscal,dx{I}{J},dy{I}{J},dz{I}{J});
+            /*     #else */
+            gmx_fjsp_decrement_fma_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fscal,dx{I}{J},dy{I}{J},dz{I}{J});
+            /*     #endif */
+            /*     #define INNERFLOPS INNERFLOPS+3      */
+            /* #else                                    */
+            fjx{J}             = _fjsp_madd_v2r8(dx{I}{J},fscal,fjx{J});
+            fjy{J}             = _fjsp_madd_v2r8(dy{I}{J},fscal,fjy{J});
+            fjz{J}             = _fjsp_madd_v2r8(dz{I}{J},fscal,fjz{J});
+            /*     #define INNERFLOPS INNERFLOPS+3      */
+            /* #endif                                   */
+
+            /*     #endif */
+
+            /*     #if 'exactcutoff' in INTERACTION_FLAGS[I][J] */
+            /*         #if 0    ## This and the next two lines are a hack to maintain indentation in the template file */
+            {
+                /*     #endif */
+            }
+            /*     #endif */
+            /*    ## End of check for the interaction being outside the cutoff */
+
+            /* #endfor */
+            /* ## End of loop over i-j interaction pairs */
+
+            /* #if 'Water' in GEOMETRY_I and GEOMETRY_J == 'Particle' */
+            /*     #if ROUND == 'Loop' */
+            gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
+            /*     #else */
+            gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0);
+            /*     #endif */
+            /*     #define INNERFLOPS INNERFLOPS+3      */
+            /* #elif GEOMETRY_J == 'Water3'             */
+            /*     #if ROUND == 'Loop' */
+            gmx_fjsp_decrement_3rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+            /*     #else */
+            gmx_fjsp_decrement_3rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
+            /*     #endif */
+            /*     #define INNERFLOPS INNERFLOPS+9      */
+            /* #elif GEOMETRY_J == 'Water4'             */
+            /*     #if 0 in PARTICLES_J                 */
+            /*         #if ROUND == 'Loop' */
+            gmx_fjsp_decrement_4rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+            /*         #else */
+            gmx_fjsp_decrement_4rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+            /*         #endif */
+            /*         #define INNERFLOPS INNERFLOPS+12 */
+            /*     #else                                */
+            /*         #if ROUND == 'Loop' */
+            gmx_fjsp_decrement_3rvec_2ptr_swizzle_v2r8(f+j_coord_offsetA+DIM,f+j_coord_offsetB+DIM,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+            /*         #else */
+            gmx_fjsp_decrement_3rvec_1ptr_swizzle_v2r8(f+j_coord_offsetA+DIM,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
+            /*         #endif */
+            /*         #define INNERFLOPS INNERFLOPS+9  */
+            /*     #endif                               */
+            /* #endif                                   */
+
+            /* Inner loop uses {INNERFLOPS} flops */
+        }
+
+        /* #endfor */
+
+        /* End of innermost loop */
+
+        /* #if 'Force' in KERNEL_VF */
+        /*     #if GEOMETRY_I == 'Particle'            */
+        gmx_fjsp_update_iforce_1atom_swizzle_v2r8(fix0,fiy0,fiz0,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+        /*         #define OUTERFLOPS OUTERFLOPS+6     */
+        /*     #elif GEOMETRY_I == 'Water3'            */
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+        /*         #define OUTERFLOPS OUTERFLOPS+18    */
+        /*     #elif GEOMETRY_I == 'Water4'            */
+        /*         #if 0 in PARTICLES_I                */
+        gmx_fjsp_update_iforce_4atom_swizzle_v2r8(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset,fshift+i_shift_offset);
+        /*             #define OUTERFLOPS OUTERFLOPS+24    */
+        /*         #else                               */
+        gmx_fjsp_update_iforce_3atom_swizzle_v2r8(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
+                                              f+i_coord_offset+DIM,fshift+i_shift_offset);
+        /*             #define OUTERFLOPS OUTERFLOPS+18    */
+        /*         #endif                              */
+        /*     #endif                                  */
+        /* #endif                                      */
+
+        /* #if 'Potential' in KERNEL_VF */
+        ggid                        = gid[iidx];
+        /* Update potential energies */
+        /*     #if KERNEL_ELEC != 'None' */
+        gmx_fjsp_update_1pot_v2r8(velecsum,kernel_data->energygrp_elec+ggid);
+        /*         #define OUTERFLOPS OUTERFLOPS+1 */
+        /*     #endif */
+        /*     #if 'GeneralizedBorn' in KERNEL_ELEC */
+        gmx_fjsp_update_1pot_v2r8(vgbsum,kernel_data->energygrp_polarization+ggid);
+        /*         #define OUTERFLOPS OUTERFLOPS+1 */
+        /*     #endif */
+        /*     #if KERNEL_VDW != 'None' */
+        gmx_fjsp_update_1pot_v2r8(vvdwsum,kernel_data->energygrp_vdw+ggid);
+        /*         #define OUTERFLOPS OUTERFLOPS+1 */
+        /*     #endif */
+        /* #endif */
+        /*     #if 'GeneralizedBorn' in KERNEL_ELEC and 'Force' in KERNEL_VF */
+        dvdasum = _fjsp_mul_v2r8(dvdasum, _fjsp_mul_v2r8(isai{I},isai{I}));
+        gmx_fjsp_update_1pot_v2r8(dvdasum,dvda+inr);
+        /*     #endif */
+
+        /* Increment number of inner iterations */
+        inneriter                  += j_index_end - j_index_start;
+
+        /* Outer loop uses {OUTERFLOPS} flops */
+    }
+
+    /* Increment number of outer iterations */
+    outeriter        += nri;
+
+    /* Update outer/inner flops */
+    /* ## NB: This is not important, it just affects the flopcount. However, since our preprocessor is */
+    /* ## primitive and replaces aggressively even in strings inside these directives, we need to      */
+    /* ## assemble the main part of the name (containing KERNEL/ELEC/VDW) directly in the source.      */
+    /* #if GEOMETRY_I == 'Water3'            */
+    /*     #define ISUFFIX '_W3'             */
+    /* #elif GEOMETRY_I == 'Water4'          */
+    /*     #define ISUFFIX '_W4'             */
+    /* #else                                 */
+    /*     #define ISUFFIX ''                */
+    /* #endif                                */
+    /* #if GEOMETRY_J == 'Water3'            */
+    /*     #define JSUFFIX 'W3'              */
+    /* #elif GEOMETRY_J == 'Water4'          */
+    /*     #define JSUFFIX 'W4'              */
+    /* #else                                 */
+    /*     #define JSUFFIX ''                */
+    /* #endif                                */
+    /* #if 'PotentialAndForce' in KERNEL_VF  */
+    /*     #define VFSUFFIX  '_VF'           */
+    /* #elif 'Potential' in KERNEL_VF        */
+    /*     #define VFSUFFIX '_V'             */
+    /* #else                                 */
+    /*     #define VFSUFFIX '_F'             */
+    /* #endif                                */
+
+    /* #if KERNEL_ELEC != 'None' and KERNEL_VDW != 'None' */
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW{ISUFFIX}{JSUFFIX}{VFSUFFIX},outeriter*{OUTERFLOPS} + inneriter*{INNERFLOPS});
+    /* #elif KERNEL_ELEC != 'None' */
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC{ISUFFIX}{JSUFFIX}{VFSUFFIX},outeriter*{OUTERFLOPS} + inneriter*{INNERFLOPS});
+    /* #else */
+    inc_nrnb(nrnb,eNR_NBKERNEL_VDW{ISUFFIX}{JSUFFIX}{VFSUFFIX},outeriter*{OUTERFLOPS} + inneriter*{INNERFLOPS});
+    /* #endif  */
+}
index 45288ee6f40e797f54fc229536a11be0131ff96d..0a576ad0754561b5b8baad5b23b1a2c7bc7eeca4 100644 (file)
@@ -98,6 +98,9 @@
 #if (defined GMX_CPU_ACCELERATION_X86_AVX_256 && defined GMX_DOUBLE)
 #    include "nb_kernel_avx_256_double/nb_kernel_avx_256_double.h"
 #endif
+#if (defined GMX_CPU_ACCELERATION_SPARC64_HPC_ACE && defined GMX_DOUBLE)
+#    include "nb_kernel_sparc64_hpc_ace_double/nb_kernel_sparc64_hpc_ace_double.h"
+#endif
 
 
 #ifdef GMX_THREAD_MPI
@@ -150,6 +153,9 @@ gmx_nonbonded_setup(FILE *         fplog,
 #endif
 #if (defined GMX_CPU_ACCELERATION_X86_AVX_256 && defined GMX_DOUBLE)
                 nb_kernel_list_add_kernels(kernellist_avx_256_double, kernellist_avx_256_double_size);
+#endif
+#if (defined GMX_CPU_ACCELERATION_SPARC64_HPC_ACE && defined GMX_DOUBLE)
+                nb_kernel_list_add_kernels(kernellist_sparc64_hpc_ace_double,kernellist_sparc64_hpc_ace_double_size);
 #endif
                 ; /* empty statement to avoid a completely empty block */
             }
@@ -215,6 +221,10 @@ gmx_nonbonded_set_kernel_pointers(FILE *log, t_nblist *nl)
 #if (defined GMX_CPU_ACCELERATION_X86_SSE4_1 && defined GMX_DOUBLE)
         /* No padding - see comment above */
         { "sse4_1_double", 1 },
+#endif
+#if (defined GMX_CPU_ACCELERATION_SPARC64_HPC_ACE && defined GMX_DOUBLE)
+        /* No padding - see comment above */
+        { "sparc64_hpc_ace_double", 1 },
 #endif
         { "c", 1 },
     };